refactoring to prepare new RWI entry object

- moved all url and index(RWI) entries to index package
- better naming to distinguish RWI entries and URL entries


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2937 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 6412c926bc
commit bb7d4b5d5e

@ -55,8 +55,8 @@ import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager; import de.anomic.data.listManager;
import de.anomic.data.bookmarksDB.Tag; import de.anomic.data.bookmarksDB.Tag;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -147,10 +147,10 @@ public class Bookmarks {
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash); bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) { if (bookmark == null) {
// try to get the bookmark from the LURL database // try to get the bookmark from the LURL database
plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); indexURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaParserDocument document = null; plasmaParserDocument document = null;
if (urlentry != null) { if (urlentry != null) {
plasmaCrawlLURLEntry.Components comp = urlentry.comp(); indexURLEntry.Components comp = urlentry.comp();
document = switchboard.snippetCache.retrieveDocument(comp.url(), true); document = switchboard.snippetCache.retrieveDocument(comp.url(), true);
prop.put("mode_edit", 0); // create mode prop.put("mode_edit", 0); // create mode
prop.put("mode_url", comp.url().toNormalform()); prop.put("mode_url", comp.url().toNormalform());

@ -57,11 +57,11 @@ import java.util.TreeMap;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
@ -161,7 +161,7 @@ public class IndexControl_p {
int i = 0; int i = 0;
urlx = new String[index.size()]; urlx = new String[index.size()];
while (en.hasNext()) { while (en.hasNext()) {
urlx[i++] = ((indexEntry) en.next()).urlHash(); urlx[i++] = ((indexRWIEntry) en.next()).urlHash();
} }
index = null; index = null;
} }
@ -218,7 +218,7 @@ public class IndexControl_p {
} }
if (post.containsKey("urlhashdelete")) { if (post.containsKey("urlhashdelete")) {
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) { if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else { } else {
@ -263,10 +263,10 @@ public class IndexControl_p {
Iterator urlIter = index.entries(); Iterator urlIter = index.entries();
HashMap knownURLs = new HashMap(); HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet(); HashSet unknownURLEntries = new HashSet();
indexEntry iEntry; indexRWIEntry iEntry;
plasmaCrawlLURLEntry lurl; indexURLEntry lurl;
while (urlIter.hasNext()) { while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next(); iEntry = (indexRWIEntry) urlIter.next();
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null); lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
if (lurl == null) { if (lurl == null) {
unknownURLEntries.add(iEntry.urlHash()); unknownURLEntries.add(iEntry.urlHash());
@ -320,7 +320,7 @@ public class IndexControl_p {
URL url = new URL(urlstring); URL url = new URL(urlstring);
urlhash = indexURL.urlHash(url); urlhash = indexURL.urlHash(url);
prop.put("urlhash", urlhash); prop.put("urlhash", urlhash);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) { if (entry == null) {
prop.put("urlstring", "unknown url: " + urlstring); prop.put("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", ""); prop.put("urlhash", "");
@ -334,7 +334,7 @@ public class IndexControl_p {
} }
if (post.containsKey("urlhashsearch")) { if (post.containsKey("urlhashsearch")) {
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) { if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash); prop.put("result", "No Entry for URL hash " + urlhash);
} else { } else {
@ -348,12 +348,12 @@ public class IndexControl_p {
try { try {
final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash); final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash);
StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:<br>"); StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:<br>");
plasmaCrawlLURLEntry entry; indexURLEntry entry;
int i = 0; int i = 0;
int rows = 0, cols = 0; int rows = 0, cols = 0;
prop.put("urlhashsimilar", 1); prop.put("urlhashsimilar", 1);
while (entryIt.hasNext() && i < 256) { while (entryIt.hasNext() && i < 256) {
entry = (plasmaCrawlLURLEntry) entryIt.next(); entry = (indexURLEntry) entryIt.next();
prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", entry.hash()); prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", entry.hash());
cols++; cols++;
if (cols==8) { if (cols==8) {
@ -400,16 +400,16 @@ public class IndexControl_p {
return prop; return prop;
} }
public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURLEntry entry, String urlhash) { public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, indexURLEntry entry, String urlhash) {
serverObjects prop = new serverObjects(); serverObjects prop = new serverObjects();
if (entry == null) { if (entry == null) {
prop.put("genUrlProfile", 1); prop.put("genUrlProfile", 1);
prop.put("genUrlProfile_urlhash", urlhash); prop.put("genUrlProfile_urlhash", urlhash);
return prop; return prop;
} }
plasmaCrawlLURLEntry.Components comp = entry.comp(); indexURLEntry.Components comp = entry.comp();
String referrer = null; String referrer = null;
plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null); indexURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (le == null) { if (le == null) {
referrer = "<unknown>"; referrer = "<unknown>";
} else { } else {
@ -453,11 +453,11 @@ public class IndexControl_p {
int i = 0; int i = 0;
final TreeMap tm = new TreeMap(); final TreeMap tm = new TreeMap();
indexEntry xi; indexRWIEntry xi;
while (en.hasNext()) { while (en.hasNext()) {
xi = (indexEntry) en.next(); xi = (indexRWIEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())}; uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null); indexURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null);
if (le == null) { if (le == null) {
tm.put(uh[0], uh); tm.put(uh[0], uh);
} else { } else {

@ -60,6 +60,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlProfile;
@ -204,7 +205,7 @@ public class IndexCreate_p {
prop.put("error_reasonString", reasonString); prop.put("error_reasonString", reasonString);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength)); crawlingStartURL.getHost(), reasonString, new bitfield(indexRWIEntryOld.urlFlagLength));
ee.store(); ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee); switchboard.urlPool.errorURL.stackPushEntry(ee);
} }
@ -282,7 +283,7 @@ public class IndexCreate_p {
c++; c++;
} else { } else {
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength)); (String) e.getValue(), rejectReason, new bitfield(indexRWIEntryOld.urlFlagLength));
ee.store(); ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee); switchboard.urlPool.errorURL.stackPushEntry(ee);
} }

@ -43,22 +43,33 @@
// javac -classpath .:../Classes Settings_p.java // javac -classpath .:../Classes Settings_p.java
// if the shell's current path is HTROOT // if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
public class IndexMonitor { public class IndexMonitor {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
// return variable that accumulates replacements // return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) env; plasmaSwitchboard sb = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects(); serverObjects prop = new serverObjects();
int showIndexedCount = 40; int lines = 40;
boolean si = false; boolean showInit = false;
boolean se = false; boolean showExec = false;
if (post == null) { if (post == null) {
@ -67,20 +78,20 @@ public class IndexMonitor {
} }
// find process number // find process number
int process; int tabletype;
try { try {
process = Integer.parseInt(post.get("process", "0")); tabletype = Integer.parseInt(post.get("process", "0"));
} catch (NumberFormatException e) { } catch (NumberFormatException e) {
process = 0; tabletype = 0;
} }
// check if authorization is needed and/or given // check if authorization is needed and/or given
if (((process > 0) && (process < 6)) || if (((tabletype > 0) && (tabletype < 6)) ||
(post.containsKey("clearlist")) || (post.containsKey("clearlist")) ||
(post.containsKey("deleteentry"))) { (post.containsKey("deleteentry"))) {
String authorization = ((String) header.get("Authorization", "xxxxxx")); String authorization = ((String) header.get("Authorization", "xxxxxx"));
if (authorization.length() != 0) { if (authorization.length() != 0) {
if (! switchboard.verifyAuthentication(header, true)){ if (! sb.verifyAuthentication(header, true)){
// force log-in (again, because wrong password was given) // force log-in (again, because wrong password was given)
prop.put("AUTHENTICATE", "admin log-in"); prop.put("AUTHENTICATE", "admin log-in");
return prop; return prop;
@ -94,33 +105,102 @@ public class IndexMonitor {
// custom number of lines // custom number of lines
if (post.containsKey("count")) { if (post.containsKey("count")) {
showIndexedCount = Integer.parseInt(post.get("count", "40")); lines = Integer.parseInt(post.get("count", "40"));
} }
// do the commands // do the commands
if (post.containsKey("clearlist")) switchboard.urlPool.loadedURL.clearStack(process); if (post.containsKey("clearlist")) sb.urlPool.loadedURL.clearStack(tabletype);
if (post.containsKey("deleteentry")) { if (post.containsKey("deleteentry")) {
String hash = post.get("hash", null); String hash = post.get("hash", null);
if (hash != null) { if (hash != null) {
// delete from database // delete from database
switchboard.urlPool.loadedURL.remove(hash); sb.urlPool.loadedURL.remove(hash);
} }
} }
if (post.containsKey("moreIndexed")) { if (post.containsKey("moreIndexed")) {
showIndexedCount = Integer.parseInt(post.get("showIndexed", "40")); lines = Integer.parseInt(post.get("showIndexed", "40"));
} }
if (post.get("si") != null) si = true; if (post.get("si") != null) showInit = true;
if (post.get("se") != null) se = true; if (post.get("se") != null) showExec = true;
// create table // create table
if (process == 0) { if (tabletype == 0) {
prop.put("table", 2); prop.put("table", 2);
} else if (sb.urlPool.loadedURL.getStackSize(tabletype) == 0) {
prop.put("table", 0);
} else {
prop.put("table", 1);
if (lines > sb.urlPool.loadedURL.getStackSize(tabletype)) lines = sb.urlPool.loadedURL.getStackSize(tabletype);
if (lines == sb.urlPool.loadedURL.getStackSize(tabletype)) {
prop.put("table_size", 0);
} else { } else {
prop.putAll(switchboard.urlPool.loadedURL.genTableProps(process, showIndexedCount, si, se, "unknown", null, "IndexMonitor.html", true)); prop.put("table_size", 1);
prop.put("table_size_count", lines);
}
prop.put("table_size_all", sb.urlPool.loadedURL.getStackSize(tabletype));
prop.put("table_feedbackpage", "IndexMonitor.html");
prop.put("table_tabletype", tabletype);
prop.put("table_showInit", (showInit) ? 1 : 0);
prop.put("table_showExec", (showExec) ? 1 : 0);
boolean dark = true;
String urlHash, initiatorHash, executorHash;
String cachepath, urlstr, urltxt;
yacySeed initiatorSeed, executorSeed;
indexURLEntry urle;
// needed for getCachePath(url)
final plasmaHTCache cacheManager = sb.getCacheManager();
int i, cnt = 0;
for (i = sb.urlPool.loadedURL.getStackSize(tabletype) - 1; i >= (sb.urlPool.loadedURL.getStackSize(tabletype) - lines); i--) {
initiatorHash = sb.urlPool.loadedURL.getInitiatorHash(tabletype, i);
executorHash = sb.urlPool.loadedURL.getExecutorHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
urlHash = sb.urlPool.loadedURL.getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = sb.urlPool.loadedURL.load(urlHash, null);
indexURLEntry.Components comp = urle.comp();
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash);
urlstr = comp.url().toNormalform();
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1);
prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_feedbackpage", "IndexMonitor.html");
prop.put("table_indexed_" + cnt + "_tabletype", tabletype);
prop.put("table_indexed_" + cnt + "_urlhash", urlHash);
prop.put("table_indexed_" + cnt + "_showInit", (showInit) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_showInit_initiatorSeed", (initiatorSeed == null) ? "unknown" : initiatorSeed.getName());
prop.put("table_indexed_" + cnt + "_showExec", (showExec) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? "unknown" : executorSeed.getName());
prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate()));
prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount());
prop.put("table_indexed_" + cnt + "_urldescr", comp.descr());
prop.put("table_indexed_" + cnt + "_url", (cachepath == null) ? "-not-cached-" : "<a href=\"CacheAdmin_p.html?action=info&path=" + cachepath + "\" class=\"small\" title=\"" + urlstr + "\">" + urltxt + "</a>");
dark = !dark;
cnt++;
} catch (Exception e) {
serverLog.logSevere("PLASMA", "genTableProps", e);
}
}
prop.put("table_indexed", cnt);
} }
prop.put("process", process); prop.put("process", tabletype);
// return rewrite properties // return rewrite properties
return prop; return prop;
} }
// Shared formatter for the "moddate" table column. SimpleDateFormat keeps
// mutable parsing/formatting state and is not thread-safe, so it must only
// be used through daydate(), which serializes access to it.
private static final SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);

/**
 * Formats a date as a day string of the form yyyy/MM/dd.
 *
 * @param date the date to format; may be null
 * @return the formatted day, or an empty string if date is null
 */
private static String daydate(Date date) {
    if (date == null) {
        return "";
    }
    // respond() is static and may be entered by several request threads at
    // once; lock the shared formatter so concurrent calls cannot corrupt
    // each other's output.
    synchronized (dayFormatter) {
        return dayFormatter.format(date);
    }
}
} }

@ -54,7 +54,7 @@ import java.util.Enumeration;
import de.anomic.data.wikiCode; import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
@ -106,7 +106,7 @@ public class ViewFile {
String viewMode = post.get("viewMode","sentences"); String viewMode = post.get("viewMode","sentences");
// getting the urlEntry that belongs to the url hash // getting the urlEntry that belongs to the url hash
plasmaCrawlLURLEntry urlEntry = null; indexURLEntry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null); urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
if (urlEntry == null) { if (urlEntry == null) {
prop.put("error",2); prop.put("error",2);
@ -115,7 +115,7 @@ public class ViewFile {
} }
// gettin the url that belongs to the entry // gettin the url that belongs to the entry
plasmaCrawlLURLEntry.Components comp = urlEntry.comp(); indexURLEntry.Components comp = urlEntry.comp();
if ((comp == null) || (comp.url() == null)) { if ((comp == null) || (comp.url() == null)) {
prop.put("error",3); prop.put("error",3);
prop.put("viewMode",VIEW_MODE_NO_TEXT); prop.put("viewMode",VIEW_MODE_NO_TEXT);

@ -61,10 +61,10 @@ import de.anomic.data.userDB;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
@ -362,7 +362,7 @@ public class dir {
try { try {
final URL url = new URL(urlstring); final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes())); final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry( final indexURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
url, url,
"YaCyShare: " + descr, "YaCyShare: " + descr,
yacyCore.seedDB.mySeed.getName(), yacyCore.seedDB.mySeed.getName(),

@ -50,8 +50,8 @@ import java.util.Date;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -249,7 +249,7 @@ public final class crawlOrder {
// case where we have already the url loaded; // case where we have already the url loaded;
reason = reasonString; reason = reasonString;
// send lurl-Entry as response // send lurl-Entry as response
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null); indexURLEntry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
if (entry == null) { if (entry == null) {
response = "rejected"; response = "rejected";
lurl = ""; lurl = "";

@ -50,8 +50,9 @@ import java.io.IOException;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -124,12 +125,12 @@ public final class crawlReceipt {
prop.put("delay", "3600"); prop.put("delay", "3600");
} else if (result.equals("fill")) { } else if (result.equals("fill")) {
// generating a new loaded URL entry // generating a new loaded URL entry
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr); indexURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr);
if (entry == null) { if (entry == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam + log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr); "\n\tURL properties: "+ propStr);
} else { } else {
plasmaCrawlLURLEntry.Components comp = entry.comp(); indexURLEntry.Components comp = entry.comp();
if (comp.url() == null) { if (comp.url() == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + receivedUrlhash + " from peer " + iam + log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr); "\n\tURL properties: "+ propStr);
@ -156,7 +157,7 @@ public final class crawlReceipt {
} else { } else {
try { try {
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash); plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength)); plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexRWIEntryOld.urlFlagLength));
ee.store(); ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee); switchboard.urlPool.errorURL.stackPushEntry(ee);
switchboard.urlPool.noticeURL.remove(receivedUrlhash); switchboard.urlPool.noticeURL.remove(receivedUrlhash);

@ -54,7 +54,7 @@ import java.util.Set;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchRankingProfile;
@ -249,10 +249,10 @@ public final class search {
StringBuffer links = new StringBuffer(); StringBuffer links = new StringBuffer();
String resource = ""; String resource = "";
//plasmaIndexEntry pie; //plasmaIndexEntry pie;
plasmaCrawlLURLEntry urlentry; indexURLEntry urlentry;
plasmaSnippetCache.Snippet snippet; plasmaSnippetCache.Snippet snippet;
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) { while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = (plasmaCrawlLURLEntry) acc.nextElement(); urlentry = (indexURLEntry) acc.nextElement();
if (includesnippet) { if (includesnippet) {
snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, 260, 1000); snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, 260, 1000);
} else { } else {

@ -51,8 +51,8 @@ import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexRWIEntryOld;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
@ -146,7 +146,7 @@ public final class transferRWI {
int p; int p;
String wordHash; String wordHash;
String urlHash; String urlHash;
indexEntry iEntry; indexRWIEntry iEntry;
int wordhashesSize = v.size(); int wordhashesSize = v.size();
final HashSet unknownURL = new HashSet(); final HashSet unknownURL = new HashSet();
final HashSet knownURL = new HashSet(); final HashSet knownURL = new HashSet();
@ -162,7 +162,7 @@ public final class transferRWI {
if (p > 0) { if (p > 0) {
wordHash = estring.substring(0, p); wordHash = estring.substring(0, p);
wordhashes[received] = wordHash; wordhashes[received] = wordHash;
iEntry = new indexURLEntry(estring.substring(p)); iEntry = new indexRWIEntryOld(estring.substring(p));
urlHash = iEntry.urlHash(); urlHash = iEntry.urlHash();
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) { if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) {
int deleted = sb.wordIndex.tryRemoveURLs(urlHash); int deleted = sb.wordIndex.tryRemoveURLs(urlHash);

@ -48,7 +48,7 @@
import java.io.IOException; import java.io.IOException;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
@ -90,7 +90,7 @@ public final class transferURL {
final int sizeBefore = sb.urlPool.loadedURL.size(); final int sizeBefore = sb.urlPool.loadedURL.size();
// read the urls from the other properties and store // read the urls from the other properties and store
String urls; String urls;
plasmaCrawlLURLEntry lEntry; indexURLEntry lEntry;
for (int i = 0; i < urlc; i++) { for (int i = 0; i < urlc; i++) {
serverCore.checkInterruption(); serverCore.checkInterruption();
urls = (String) post.get("url" + i); urls = (String) post.get("url" + i);
@ -102,7 +102,7 @@ public final class transferURL {
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls); yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message??? // TODO: should we send back an error message???
} else { } else {
plasmaCrawlLURLEntry.Components comp = lEntry.comp(); indexURLEntry.Components comp = lEntry.comp();
if (comp.url() == null) { if (comp.url() == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls); yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message??? // TODO: should we send back an error message???

@ -54,10 +54,10 @@ import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchImages; import de.anomic.plasma.plasmaSearchImages;
import de.anomic.plasma.plasmaSearchPreOrder; import de.anomic.plasma.plasmaSearchPreOrder;
@ -189,9 +189,9 @@ public class yacysearch {
return prop; return prop;
} }
final String recommendHash = post.get("recommendref", ""); // urlhash final String recommendHash = post.get("recommendref", ""); // urlhash
plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null); indexURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
if (urlentry != null) { if (urlentry != null) {
plasmaCrawlLURLEntry.Components comp = urlentry.comp(); indexURLEntry.Components comp = urlentry.comp();
plasmaParserDocument document; plasmaParserDocument document;
document = sb.snippetCache.retrieveDocument(comp.url(), true); document = sb.snippetCache.retrieveDocument(comp.url(), true);
if (document != null) { if (document != null) {

@ -91,7 +91,7 @@ public class indexCachedRI implements indexRI {
return new indexContainer(wordHash, payloadrow); return new indexContainer(wordHash, payloadrow);
} }
public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean intern) { public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean intern) {
// add the entry // add the entry
if (intern) { if (intern) {
riIntern.addEntry(wordHash, entry, updateTime, true); riIntern.addEntry(wordHash, entry, updateTime, true);

@ -152,7 +152,7 @@ public class indexCollectionRI implements indexRI {
} }
} }
public synchronized indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow()); indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow());
container.add(newEntry); container.add(newEntry);
return addEntries(container, updateTime, dhtCase); return addEntries(container, updateTime, dhtCase);

@ -81,18 +81,18 @@ public class indexContainer extends kelondroRowSet {
return wordHash; return wordHash;
} }
public int add(indexEntry entry) { public int add(indexRWIEntry entry) {
this.addUnique(entry.toKelondroEntry()); this.addUnique(entry.toKelondroEntry());
return 1; return 1;
} }
public int add(indexEntry entry, long updateTime) { public int add(indexRWIEntry entry, long updateTime) {
this.add(entry); this.add(entry);
this.lastTimeWrote = updateTime; this.lastTimeWrote = updateTime;
return 1; return 1;
} }
public int add(indexEntry[] entries, long updateTime) { public int add(indexRWIEntry[] entries, long updateTime) {
for (int i = 0; i < entries.length; i++) this.add(entries[i], updateTime); for (int i = 0; i < entries.length; i++) this.add(entries[i], updateTime);
return entries.length; return entries.length;
} }
@ -106,7 +106,7 @@ public class indexContainer extends kelondroRowSet {
Iterator i = c.entries(); Iterator i = c.entries();
while (i.hasNext()) { while (i.hasNext()) {
try { try {
if (addi((indexEntry) i.next())) x++; if (addi((indexRWIEntry) i.next())) x++;
} catch (ConcurrentModificationException e) { } catch (ConcurrentModificationException e) {
e.printStackTrace(); e.printStackTrace();
} }
@ -117,13 +117,13 @@ public class indexContainer extends kelondroRowSet {
return x; return x;
} }
private boolean addi(indexEntry entry) { private boolean addi(indexRWIEntry entry) {
// returns true if the new entry was added, false if it already existed // returns true if the new entry was added, false if it already existed
kelondroRow.Entry oldEntryRow = this.put(entry.toKelondroEntry()); kelondroRow.Entry oldEntryRow = this.put(entry.toKelondroEntry());
if (oldEntryRow == null) { if (oldEntryRow == null) {
return true; return true;
} else { } else {
indexEntry oldEntry = new indexURLEntry(oldEntryRow); // FIXME: see if cloning is necessary indexRWIEntry oldEntry = new indexRWIEntryOld(oldEntryRow); // FIXME: see if cloning is necessary
if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container
this.put(oldEntry.toKelondroEntry()); // put it back this.put(oldEntry.toKelondroEntry()); // put it back
return false; return false;
@ -133,16 +133,16 @@ public class indexContainer extends kelondroRowSet {
} }
} }
public indexEntry get(String urlHash) { public indexRWIEntry get(String urlHash) {
kelondroRow.Entry entry = this.get(urlHash.getBytes()); kelondroRow.Entry entry = this.get(urlHash.getBytes());
if (entry == null) return null; if (entry == null) return null;
return new indexURLEntry(entry); return new indexRWIEntryOld(entry);
} }
public indexEntry remove(String urlHash) { public indexRWIEntry remove(String urlHash) {
kelondroRow.Entry entry = this.remove(urlHash.getBytes()); kelondroRow.Entry entry = this.remove(urlHash.getBytes());
if (entry == null) return null; if (entry == null) return null;
return new indexURLEntry(entry); return new indexRWIEntryOld(entry);
} }
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
@ -178,7 +178,7 @@ public class indexContainer extends kelondroRowSet {
public Object next() { public Object next() {
kelondroRow.Entry rentry = (kelondroRow.Entry) rowEntryIterator.next(); kelondroRow.Entry rentry = (kelondroRow.Entry) rowEntryIterator.next();
if (rentry == null) return null; if (rentry == null) return null;
return new indexURLEntry(rentry); return new indexRWIEntryOld(rentry);
} }
public void remove() { public void remove() {
@ -288,10 +288,10 @@ public class indexContainer extends kelondroRowSet {
assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString(); assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString();
indexContainer conj = new indexContainer(null, small.rowdef); // start with empty search result indexContainer conj = new indexContainer(null, small.rowdef); // start with empty search result
Iterator se = small.entries(); Iterator se = small.entries();
indexEntry ie0, ie1; indexRWIEntry ie0, ie1;
long stamp = System.currentTimeMillis(); long stamp = System.currentTimeMillis();
while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) { while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
ie0 = (indexEntry) se.next(); ie0 = (indexRWIEntry) se.next();
ie1 = large.get(ie0.urlHash()); ie1 = large.get(ie0.urlHash());
if (ie1 != null) { if (ie1 != null) {
// this is a hit. Calculate word distance: // this is a hit. Calculate word distance:
@ -312,25 +312,25 @@ public class indexContainer extends kelondroRowSet {
Iterator e2 = i2.entries(); Iterator e2 = i2.entries();
int c; int c;
if ((e1.hasNext()) && (e2.hasNext())) { if ((e1.hasNext()) && (e2.hasNext())) {
indexEntry ie1; indexRWIEntry ie1;
indexEntry ie2; indexRWIEntry ie2;
ie1 = (indexEntry) e1.next(); ie1 = (indexRWIEntry) e1.next();
ie2 = (indexEntry) e2.next(); ie2 = (indexRWIEntry) e2.next();
long stamp = System.currentTimeMillis(); long stamp = System.currentTimeMillis();
while ((System.currentTimeMillis() - stamp) < time) { while ((System.currentTimeMillis() - stamp) < time) {
c = i1.order().compare(ie1.urlHash(), ie2.urlHash()); c = i1.order().compare(ie1.urlHash(), ie2.urlHash());
//System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c);
if (c < 0) { if (c < 0) {
if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break; if (e1.hasNext()) ie1 = (indexRWIEntry) e1.next(); else break;
} else if (c > 0) { } else if (c > 0) {
if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break; if (e2.hasNext()) ie2 = (indexRWIEntry) e2.next(); else break;
} else { } else {
// we have found the same urls in different searches! // we have found the same urls in different searches!
ie1.combineDistance(ie2); ie1.combineDistance(ie2);
if (ie1.worddistance() <= maxDistance) conj.add(ie1); if (ie1.worddistance() <= maxDistance) conj.add(ie1);
if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break; if (e1.hasNext()) ie1 = (indexRWIEntry) e1.next(); else break;
if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break; if (e2.hasNext()) ie2 = (indexRWIEntry) e2.next(); else break;
} }
} }
} }

@ -35,10 +35,6 @@ import de.anomic.yacy.yacySeedDB;
public class indexEntryAttribute { public class indexEntryAttribute {
// the size of a word hash
public static final int wordHashLength = yacySeedDB.commonHashLength; // 12
public static final int urlHashLength = yacySeedDB.commonHashLength; // 12
// doctypes: // doctypes:
public static final char DT_PDFPS = 'p'; public static final char DT_PDFPS = 'p';
public static final char DT_TEXT = 't'; public static final char DT_TEXT = 't';
@ -86,7 +82,7 @@ public class indexEntryAttribute {
// create a word hash // create a word hash
public static String word2hash(String word) { public static String word2hash(String word) {
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, indexEntryAttribute.wordHashLength); return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength);
} }
// doctype calculation // doctype calculation

@ -81,7 +81,7 @@ public final class indexRAMRI implements indexRI {
this.indexArrayFileName = dumpname; this.indexArrayFileName = dumpname;
this.payloadrow = payloadrow; this.payloadrow = payloadrow;
this.bufferStructureBasis = new kelondroRow( this.bufferStructureBasis = new kelondroRow(
"byte[] wordhash-" + indexEntryAttribute.wordHashLength + ", " + "byte[] wordhash-" + yacySeedDB.commonHashLength + ", " +
"Cardinal occ-4 {b256}, " + "Cardinal occ-4 {b256}, " +
"Cardinal time-8 {b256}, " + "Cardinal time-8 {b256}, " +
"byte[] urlprops-" + payloadrow.objectsize()); "byte[] urlprops-" + payloadrow.objectsize());
@ -114,7 +114,7 @@ public final class indexRAMRI implements indexRI {
String wordHash; String wordHash;
indexContainer container; indexContainer container;
long updateTime; long updateTime;
indexEntry iEntry; indexRWIEntry iEntry;
kelondroRow.Entry row = dumpArray.row().newEntry(); kelondroRow.Entry row = dumpArray.row().newEntry();
// write wCache // write wCache
@ -131,7 +131,7 @@ public final class indexRAMRI implements indexRI {
if (container != null) { if (container != null) {
Iterator ci = container.entries(); Iterator ci = container.entries();
while (ci.hasNext()) { while (ci.hasNext()) {
iEntry = (indexEntry) ci.next(); iEntry = (indexRWIEntry) ci.next();
row.setCol(0, wordHash.getBytes()); row.setCol(0, wordHash.getBytes());
row.setCol(1, kelondroNaturalOrder.encodeLong(container.size(), 4)); row.setCol(1, kelondroNaturalOrder.encodeLong(container.size(), 4));
row.setCol(2, kelondroNaturalOrder.encodeLong(updateTime, 8)); row.setCol(2, kelondroNaturalOrder.encodeLong(updateTime, 8));
@ -169,7 +169,7 @@ public final class indexRAMRI implements indexRI {
Iterator i = dumpArray.contentRows(-1); Iterator i = dumpArray.contentRows(-1);
String wordHash; String wordHash;
//long creationTime; //long creationTime;
indexEntry wordEntry; indexRWIEntry wordEntry;
kelondroRow.Entry row; kelondroRow.Entry row;
//Runtime rt = Runtime.getRuntime(); //Runtime rt = Runtime.getRuntime();
while (i.hasNext()) { while (i.hasNext()) {
@ -178,7 +178,7 @@ public final class indexRAMRI implements indexRI {
if ((row == null) || (row.empty(0)) || (row.empty(3))) continue; if ((row == null) || (row.empty(0)) || (row.empty(3))) continue;
wordHash = row.getColString(0, "UTF-8"); wordHash = row.getColString(0, "UTF-8");
//creationTime = kelondroRecords.bytes2long(row[2]); //creationTime = kelondroRecords.bytes2long(row[2]);
wordEntry = new indexURLEntry(row.getColBytes(3)); wordEntry = new indexRWIEntryOld(row.getColBytes(3));
// store to cache // store to cache
addEntry(wordHash, wordEntry, startTime, false); addEntry(wordHash, wordEntry, startTime, false);
urlCount++; urlCount++;
@ -437,10 +437,10 @@ public final class indexRAMRI implements indexRI {
return null; return null;
} }
public synchronized indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = (indexContainer) cache.get(wordHash); indexContainer container = (indexContainer) cache.get(wordHash);
if (container == null) container = new indexContainer(wordHash, this.payloadrow); if (container == null) container = new indexContainer(wordHash, this.payloadrow);
indexEntry[] entries = new indexEntry[] { newEntry }; indexRWIEntry[] entries = new indexRWIEntry[] { newEntry };
if (container.add(entries, updateTime) > 0) { if (container.add(entries, updateTime) > 0) {
cache.put(wordHash, container); cache.put(wordHash, container);
hashScore.incScore(wordHash); hashScore.incScore(wordHash);

@ -44,7 +44,7 @@ public interface indexRI {
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete); public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete);
public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete); public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete);
public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean dhtCase); public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtCase);
public indexContainer addEntries(indexContainer newEntries, long creationTime, boolean dhtCase); public indexContainer addEntries(indexContainer newEntries, long creationTime, boolean dhtCase);
public void close(int waitingSeconds); public void close(int waitingSeconds);

@ -1,4 +1,4 @@
// indexEntry.java // indexRWIEntry.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany // (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 20.05.2006 on http://www.anomic.de // first published 20.05.2006 on http://www.anomic.de
// //
@ -28,7 +28,7 @@ package de.anomic.index;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
public interface indexEntry { public interface indexRWIEntry {
public Object clone(); public Object clone();
public String toPropertyForm(boolean displayFormat); public String toPropertyForm(boolean displayFormat);
@ -48,13 +48,13 @@ public interface indexEntry {
public char getType(); public char getType();
public boolean isLocal(); public boolean isLocal();
public void combineDistance(indexEntry oe); public void combineDistance(indexRWIEntry oe);
public int worddistance(); public int worddistance();
public void min(indexEntry other); public void min(indexRWIEntry other);
public void max(indexEntry other); public void max(indexRWIEntry other);
public void normalize(indexEntry min, indexEntry max); public void normalize(indexRWIEntry min, indexRWIEntry max);
public indexEntry generateNormalized(indexEntry min, indexEntry max); public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max);
public boolean isNewer(indexEntry other); public boolean isNewer(indexRWIEntry other);
public boolean isOlder(indexEntry other); public boolean isOlder(indexRWIEntry other);
} }

@ -0,0 +1,323 @@
// indexRWIEntryOld.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 21.07.2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.index;
import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.yacy.yacySeedDB;
/**
 * An RWI (reverse word index) entry: the attributes of one URL reference
 * inside an RWI collection, backed by a single {@link kelondroRow.Entry}
 * laid out according to {@link #urlEntryRow}.
 *
 * All accessors read from, and all mutators write to, the wrapped row entry;
 * the object holds no other state.
 */
public class indexRWIEntryOld implements Cloneable, indexRWIEntry {

    // this object stores attributes to URL references inside RWI collections

    // statics for value lengths
    public static final int urlStringLength     = 256;// not too short for links without parameters
    public static final int urlDescrLength      = 80; // The headline of a web page (meta-tag or <h1>)
    public static final int urlNameLength       = 40; // the tag content between <a> and </a>
    public static final int urldescrtagsLength  = 320;// the url, the description and tags in one string
    public static final int urlErrorLength      = 80; // a reason description for unavailable urls
    public static final int urlDateLength       = 4;  // any date, shortened
    public static final int urlCopyCountLength  = 2;  // counter for numbers of copies of this index
    public static final int urlFlagLength       = 2;  // any stuff
    public static final int urlLanguageLength   = 2;  // taken from TLD suffix as quick-hack
    public static final int urlDoctypeLength    = 1;  // taken from extension
    public static final int urlSizeLength       = 6;  // the source size, from cache
    public static final int urlWordCountLength  = 3;  // the number of words, from condenser
    public static final int urlCrawlProfileHandleLength = 4; // name of the prefetch profile
    public static final int urlCrawlDepthLength = 2;  // prefetch depth, first is '0'
    public static final int urlParentBranchesLength = 3; // number of anchors of the parent
    public static final int urlForkFactorLength = 4;  // sum of anchors of all ancestors
    public static final int urlRetryLength      = 2;  // number of load retries
    public static final int urlHostLength       = 8;  // the host as truncated name
    public static final int urlHandleLength     = 4;  // a handle
    public static final int urlQualityLength    = 3;  // taken from heuristic

    // row layout of one entry; the col_* constants below index into this array in the same order
    public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{
            new kelondroColumn("h", kelondroColumn.celltype_string,   kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"),
            new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e,  urlQualityLength,            "quality"),
            new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e,  3,                           "lastModified"),
            new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e,  2,                           "hitcount"),
            new kelondroColumn("l", kelondroColumn.celltype_string,   kelondroColumn.encoder_bytes, urlLanguageLength,           "language"),
            new kelondroColumn("d", kelondroColumn.celltype_binary,   kelondroColumn.encoder_bytes, 1,                           "doctype"),
            new kelondroColumn("f", kelondroColumn.celltype_binary,   kelondroColumn.encoder_bytes, 1,                           "localflag"),
            new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e,  2,                           "posintext"),
            new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e,  2,                           "posinphrase"),
            new kelondroColumn("o", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e,  2,                           "posofphrase"),
            new kelondroColumn("i", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e,  2,                           "worddistance"),
            new kelondroColumn("w", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e,  2,                           "wordcount"),
            new kelondroColumn("p", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e,  2,                           "phrasecount")
    });

    // column indices into urlEntryRow
    private static final int col_urlhash      =  0;
    private static final int col_quality      =  1;
    private static final int col_lastModified =  2;
    private static final int col_hitcount     =  3;
    private static final int col_language     =  4;
    private static final int col_doctype      =  5;
    private static final int col_localflag    =  6;
    private static final int col_posintext    =  7;
    private static final int col_posinphrase  =  8;
    private static final int col_posofphrase  =  9;
    private static final int col_worddistance = 10;
    private static final int col_wordcount    = 11;
    private static final int col_phrasecount  = 12;

    // the row entry that holds all attribute values of this object
    private kelondroRow.Entry entry;

    /**
     * Creates a new entry from individual attribute values.
     *
     * Only the parameters whose comment starts with '*' are written into the
     * row; the remaining parameters are accepted but currently not stored.
     * If language is null or does not have exactly
     * {@link #urlLanguageLength} characters, "uk" is stored instead.
     */
    public indexRWIEntryOld(String  urlHash,
                            int     urlLength,    // byte-length of complete URL
                            int     urlComps,     // number of path components
                            int     titleLength,  // length of description/length (longer are better?)
                            int     hitcount,     //*how often appears this word in the text
                            int     wordcount,    //*total number of words
                            int     phrasecount,  //*total number of phrases
                            int     posintext,    //*position of word in all words
                            int     posinphrase,  //*position of word in its phrase
                            int     posofphrase,  //*number of the phrase where word appears
                            int     worddistance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultaneous search). If stored, this shows that the result was obtained by remote search
                            int     sizeOfPage,   // # of bytes of the page
                            long    lastmodified, //*last-modified time of the document where word appears
                            long    updatetime,   // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
                            int     quality,      //*the entropy value
                            String  language,     //*(guessed) language of document
                            char    doctype,      //*type of document
                            int     outlinksSame, // outlinks to same domain
                            int     outlinksOther,// outlinks to other domain
                            boolean local         //*flag shows that this index was generated locally; otherwise it is from a remote peer
                           ) {

        // more needed attributes:
        // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, emphasis, meta-tags, word in link etc
        // - boolean: URL attributes

        assert (urlHash.length() == yacySeedDB.commonHashLength) : "urlhash = " + urlHash;
        if ((language == null) || (language.length() != urlLanguageLength)) language = "uk";
        this.entry = urlEntryRow.newEntry();
        this.entry.setCol(col_urlhash, urlHash, null);
        this.entry.setCol(col_quality, quality);
        this.entry.setCol(col_lastModified, lastmodified);
        this.entry.setCol(col_hitcount, hitcount);
        this.entry.setCol(col_language, language, null);
        this.entry.setCol(col_doctype, (byte) doctype);
        this.entry.setCol(col_localflag, (byte) ((local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL));
        this.entry.setCol(col_posintext, posintext);
        this.entry.setCol(col_posinphrase, posinphrase);
        this.entry.setCol(col_posofphrase, posofphrase);
        this.entry.setCol(col_worddistance, worddistance);
        this.entry.setCol(col_wordcount, wordcount);
        this.entry.setCol(col_phrasecount, phrasecount);
        //System.out.println("DEBUG-NEWENTRY " + toPropertyForm());
    }

    /**
     * Creates an entry from a url hash and the remaining row content.
     *
     * @param urlHash the leading urlhash column
     * @param code    the external form of the row minus the leading urlHash entry
     */
    public indexRWIEntryOld(String urlHash, String code) {
        // NOTE(review): getBytes() uses the platform default charset; presumably the
        // content is plain ASCII so this is harmless -- confirm
        this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
    }

    /**
     * Creates an entry from the string form of a complete row, as accepted by
     * kelondroRow.newEntry(String).
     */
    public indexRWIEntryOld(String external) {
        this.entry = urlEntryRow.newEntry(external);
    }

    /**
     * Creates an entry from the byte form of a complete row, as accepted by
     * kelondroRow.newEntry(byte[]).
     */
    public indexRWIEntryOld(byte[] row) {
        this.entry = urlEntryRow.newEntry(row);
    }

    /**
     * Wraps an existing row entry. The entry is not copied: changes made
     * through this object are visible in rentry and vice versa.
     */
    public indexRWIEntryOld(kelondroRow.Entry rentry) {
        // FIXME: see if cloning is necessary
        this.entry = rentry;
    }

    /**
     * Returns a new indexRWIEntryOld built from a copy of the first
     * urlEntryRow.objectsize() bytes of this entry.
     */
    public Object clone() {
        byte[] b = new byte[urlEntryRow.objectsize()];
        System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize());
        return new indexRWIEntryOld(b);
    }

    /** Returns the property form of the wrapped row entry. */
    public String toPropertyForm(boolean displayFormat) {
        return entry.toPropertyForm(true, displayFormat, displayFormat);
    }

    /** Returns the wrapped row entry itself (not a copy). */
    public Entry toKelondroEntry() {
        return this.entry;
    }

    /** Returns the content of the urlhash column. */
    public String urlHash() {
        return this.entry.getColString(col_urlhash, null);
    }

    /** Returns the content of the quality column. */
    public int quality() {
        return (int) this.entry.getColLong(col_quality);
    }

    /** Returns plasmaWordIndex.microDateDays applied to {@link #lastModified()}. */
    public int virtualAge() {
        return plasmaWordIndex.microDateDays(lastModified());
    }

    /** Returns the content of the lastModified column. */
    public long lastModified() {
        // no narrowing cast: the method returns long, same as the column accessor
        return this.entry.getColLong(col_lastModified);
    }

    /** Returns the content of the hitcount column. */
    public int hitcount() {
        return (int) this.entry.getColLong(col_hitcount);
    }

    /** Returns the content of the posintext column. */
    public int posintext() {
        return (int) this.entry.getColLong(col_posintext);
    }

    /** Returns the content of the posinphrase column. */
    public int posinphrase() {
        return (int) this.entry.getColLong(col_posinphrase);
    }

    /** Returns the content of the posofphrase column. */
    public int posofphrase() {
        return (int) this.entry.getColLong(col_posofphrase);
    }

    /** Returns the content of the wordcount column. */
    public int wordcount() {
        return (int) this.entry.getColLong(col_wordcount);
    }

    /** Returns the content of the phrasecount column. */
    public int phrasecount() {
        return (int) this.entry.getColLong(col_phrasecount);
    }

    /** Returns the content of the language column. */
    public String getLanguage() {
        return this.entry.getColString(col_language, null);
    }

    /** Returns the doctype column as a char. */
    public char getType() {
        return (char) this.entry.getColByte(col_doctype);
    }

    /** Returns true if the localflag column equals indexEntryAttribute.LT_LOCAL. */
    public boolean isLocal() {
        return this.entry.getColByte(col_localflag) == indexEntryAttribute.LT_LOCAL;
    }

    /**
     * Merges the positional attributes of ie2 into ie1.
     *
     * worddistance becomes the sum of both distances plus the absolute
     * difference of both posintext values; posintext and posofphrase become
     * the respective minimum; wordcount becomes the integer average.
     * posinphrase becomes the smaller of both posinphrase values when both
     * entries have the same posofphrase, and 0 (unknown) otherwise.
     *
     * @param ie1 the entry that is modified in place
     * @param ie2 the entry that is merged in; it is only read
     * @return ie1, modified
     */
    public static indexRWIEntryOld combineDistance(indexRWIEntryOld ie1, indexRWIEntry ie2) {
        // returns a modified entry of the first argument
        // the distance must be computed before posintext is overwritten below
        ie1.entry.setCol(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext()));
        ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2.posintext()));
        // the phrase comparison must happen before posofphrase is overwritten below;
        // the stored value is a position inside the phrase, not the phrase number
        ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? Math.min(ie1.posinphrase(), ie2.posinphrase()) : 0 /*unknown*/);
        ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase()));
        ie1.entry.setCol(col_wordcount, (ie1.wordcount() + ie2.wordcount()) / 2);
        return ie1;
    }

    /** Merges the positional attributes of oe into this entry, see the static variant. */
    public void combineDistance(indexRWIEntry oe) {
        combineDistance(this, oe);
    }

    /** Returns the content of the worddistance column. */
    public int worddistance() {
        return (int) this.entry.getColLong(col_worddistance);
    }

    /**
     * Lowers every numeric attribute of t to the value in other wherever
     * other holds the smaller value; t is modified in place.
     */
    public static final void min(indexRWIEntryOld t, indexRWIEntry other) {
        if (t.hitcount() > other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
        if (t.wordcount() > other.wordcount()) t.entry.setCol(col_wordcount, other.wordcount());
        if (t.phrasecount() > other.phrasecount()) t.entry.setCol(col_phrasecount, other.phrasecount());
        if (t.posintext() > other.posintext()) t.entry.setCol(col_posintext, other.posintext());
        if (t.posinphrase() > other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
        if (t.posofphrase() > other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
        if (t.worddistance() > other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
        if (t.lastModified() > other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
        if (t.quality() > other.quality()) t.entry.setCol(col_quality, other.quality());
    }

    /**
     * Raises every numeric attribute of t to the value in other wherever
     * other holds the larger value; t is modified in place.
     */
    public static final void max(indexRWIEntryOld t, indexRWIEntry other) {
        if (t.hitcount() < other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
        if (t.wordcount() < other.wordcount()) t.entry.setCol(col_wordcount, other.wordcount());
        if (t.phrasecount() < other.phrasecount()) t.entry.setCol(col_phrasecount, other.phrasecount());
        if (t.posintext() < other.posintext()) t.entry.setCol(col_posintext, other.posintext());
        if (t.posinphrase() < other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
        if (t.posofphrase() < other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
        if (t.worddistance() < other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
        if (t.lastModified() < other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
        if (t.quality() < other.quality()) t.entry.setCol(col_quality, other.quality());
    }

    /** Applies the static min to this entry. */
    public void min(indexRWIEntry other) {
        min(this, other);
    }

    /** Applies the static max to this entry. */
    public void max(indexRWIEntry other) {
        max(this, other);
    }

    /**
     * Scales one attribute value relative to the given bounds.
     *
     * A value of 0 stays 0. Otherwise the result is
     * 1 + 255 * (value - min) / (1 + max - min) using integer division.
     * If the bounds are inconsistent such that 1 + max - min is 0, the
     * division is skipped and 1 is returned instead of throwing an
     * ArithmeticException.
     */
    private static long normalizeValue(long value, long min, long max) {
        if (value == 0) return 0;
        final long span = 1 + max - min;
        if (span == 0) return 1; // inconsistent bounds (max == min - 1): avoid division by zero
        return 1 + 255 * (value - min) / span;
    }

    /**
     * Replaces every numeric attribute of t by its value scaled relative to
     * the corresponding attributes of min and max; t is modified in place.
     * See normalizeValue for the formula.
     */
    static void normalize(indexRWIEntryOld t, indexRWIEntry min, indexRWIEntry max) {
        assert (t.urlHash().length() == yacySeedDB.commonHashLength) : "turlhash = " + t.urlHash();
        assert (min.urlHash().length() == yacySeedDB.commonHashLength) : "minurlhash = " + min.urlHash();
        assert (max.urlHash().length() == yacySeedDB.commonHashLength) : "maxurlhash = " + max.urlHash();
        // report inconsistent bounds; this used to end in a division by zero for worddistance
        if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm(true) + "\nmax=" + max.toPropertyForm(true));
        //System.out.println("Normalize:\nentry = " + t.toPropertyForm(true));
        //System.out.println("min   = " + min.toPropertyForm(true));
        //System.out.println("max   = " + max.toPropertyForm(true));
        t.entry.setCol(col_hitcount     , (int) normalizeValue(t.hitcount()    , min.hitcount()    , max.hitcount()));
        t.entry.setCol(col_wordcount    , (int) normalizeValue(t.wordcount()   , min.wordcount()   , max.wordcount()));
        t.entry.setCol(col_phrasecount  , (int) normalizeValue(t.phrasecount() , min.phrasecount() , max.phrasecount()));
        t.entry.setCol(col_posintext    , (int) normalizeValue(t.posintext()   , min.posintext()   , max.posintext()));
        t.entry.setCol(col_posinphrase  , (int) normalizeValue(t.posinphrase() , min.posinphrase() , max.posinphrase()));
        t.entry.setCol(col_posofphrase  , (int) normalizeValue(t.posofphrase() , min.posofphrase() , max.posofphrase()));
        t.entry.setCol(col_worddistance , (int) normalizeValue(t.worddistance(), min.worddistance(), max.worddistance()));
        t.entry.setCol(col_lastModified ,       normalizeValue(t.lastModified(), min.lastModified(), max.lastModified()));
        t.entry.setCol(col_quality      , (int) normalizeValue(t.quality()     , min.quality()     , max.quality()));
        //System.out.println("out   = " + t.toPropertyForm(true));
    }

    /** Applies the static normalize to this entry. */
    public void normalize(indexRWIEntry min, indexRWIEntry max) {
        normalize(this, min, max);
    }

    /**
     * Returns a normalized clone of this entry; this entry itself is not
     * modified.
     */
    public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max) {
        assert (this.urlHash().length() == yacySeedDB.commonHashLength) : "this.urlhash = " + this.urlHash();
        indexRWIEntryOld e = (indexRWIEntryOld) this.clone();
        e.normalize(min, max);
        return e;
    }

    /**
     * Returns true if other is null, if this entry has a greater lastModified
     * value, or if both lastModified values are equal and this entry has the
     * greater quality.
     */
    public boolean isNewer(indexRWIEntry other) {
        if (other == null) return true;
        if (this.lastModified() > other.lastModified()) return true;
        if (this.lastModified() == other.lastModified()) {
            if (this.quality() > other.quality()) return true;
        }
        return false;
    }

    /**
     * Returns true if other is not null and this entry has a smaller
     * lastModified value, or both lastModified values are equal and this
     * entry has the smaller quality.
     */
    public boolean isOlder(indexRWIEntry other) {
        if (other == null) return false;
        if (this.lastModified() < other.lastModified()) return true;
        if (this.lastModified() == other.lastModified()) {
            if (this.quality() < other.quality()) return true;
        }
        return false;
    }

}

@ -50,29 +50,6 @@ public class indexURL {
// day formatter for entry export // day formatter for entry export
public static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd"); public static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");
// statics for value lengths
public static final int urlHashLength = yacySeedDB.commonHashLength; // 12
public static final int urlStringLength = 256;// not too short for links without parameters
public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>)
public static final int urlNameLength = 40; // the tag content between <a> and </a>
public static final int urldescrtagsLength = 320;// the url, the description and tags in one string
public static final int urlErrorLength = 80; // a reason description for unavailable urls
public static final int urlDateLength = 4; // any date, shortened
public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index
public static final int urlFlagLength = 2; // any stuff
public static final int urlQualityLength = 3; // taken from heuristic
public static final int urlLanguageLength = 2; // taken from TLD suffix as quick-hack
public static final int urlDoctypeLength = 1; // taken from extension
public static final int urlSizeLength = 6; // the source size, from cache
public static final int urlWordCountLength = 3; // the number of words, from condenser
public static final int urlCrawlProfileHandleLength = 4; // name of the prefetch profile
public static final int urlCrawlDepthLength = 2; // prefetch depth, first is '0'
public static final int urlParentBranchesLength = 3; // number of anchors of the parent
public static final int urlForkFactorLength = 4; // sum of anchors of all ancestors
public static final int urlRetryLength = 2; // number of load retries
public static final int urlHostLength = 8; // the host as struncated name
public static final int urlHandleLength = 4; // a handle
private static final String[] TLD_NorthAmericaOceania={ private static final String[] TLD_NorthAmericaOceania={
// primary english-speaking countries // primary english-speaking countries
// english-speaking countries from central america are also included // english-speaking countries from central america are also included
@ -397,7 +374,7 @@ public class indexURL {
static { static {
// create a dummy hash // create a dummy hash
dummyHash = ""; dummyHash = "";
for (int i = 0; i < urlHashLength; i++) dummyHash += "-"; for (int i = 0; i < yacySeedDB.commonHashLength; i++) dummyHash += "-";
// assign TLD-ids and names // assign TLD-ids and names
insertTLDProps(TLD_EuropaRussia, 0); insertTLDProps(TLD_EuropaRussia, 0);
@ -602,13 +579,13 @@ public class indexURL {
public static final String oldurlHash(URL url) { public static final String oldurlHash(URL url) {
if (url == null) return null; if (url == null) return null;
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, urlHashLength); String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, yacySeedDB.commonHashLength);
return hash; return hash;
} }
public static final String oldurlHash(String url) throws MalformedURLException { public static final String oldurlHash(String url) throws MalformedURLException {
if ((url == null) || (url.length() < 10)) return null; if ((url == null) || (url.length() < 10)) return null;
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(new URL(url).toNormalform())).substring(0, urlHashLength); String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(new URL(url).toNormalform())).substring(0, yacySeedDB.commonHashLength);
return hash; return hash;
} }
@ -618,10 +595,10 @@ public class indexURL {
TreeMap doms = new TreeMap(); TreeMap doms = new TreeMap();
synchronized(inputContainer) { synchronized(inputContainer) {
Iterator i = inputContainer.entries(); Iterator i = inputContainer.entries();
indexEntry iEntry; indexRWIEntry iEntry;
String dom, paths; String dom, paths;
while (i.hasNext()) { while (i.hasNext()) {
iEntry = (indexEntry) i.next(); iEntry = (indexRWIEntry) i.next();
if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer
dom = iEntry.urlHash().substring(6); dom = iEntry.urlHash().substring(6);
if ((paths = (String) doms.get(dom)) == null) { if ((paths = (String) doms.get(dom)) == null) {

@ -1,6 +1,6 @@
// plasmaCrawlLURLEntry.java // indexURLEntry.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany // (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 13.10.2006 on http://www.anomic.de // first published 2006 on http://www.anomic.de
// //
// This is a part of YaCy, a peer-to-peer based web search engine // This is a part of YaCy, a peer-to-peer based web search engine
// //
@ -24,7 +24,8 @@
// along with this program; if not, write to the Free Software // along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
package de.anomic.index;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
@ -32,9 +33,9 @@ import java.util.Date;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
public interface plasmaCrawlLURLEntry { public interface indexURLEntry {
public kelondroRow.Entry toRowEntry() throws IOException; public kelondroRow.Entry toRowEntry() throws IOException;
public String hash(); public String hash();
@ -48,8 +49,8 @@ public interface plasmaCrawlLURLEntry {
public int size(); public int size();
public int wordCount(); public int wordCount();
public String snippet(); public String snippet();
public indexEntry word(); public indexRWIEntry word();
public boolean isOlder(plasmaCrawlLURLEntry other); public boolean isOlder(indexURLEntry other);
public String toString(String snippet); public String toString(String snippet);
public String toString(); public String toString();

@ -1,4 +1,4 @@
package de.anomic.plasma; package de.anomic.index;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
@ -7,9 +7,6 @@ import java.util.Date;
import java.util.Properties; import java.util.Properties;
import java.util.ArrayList; import java.util.ArrayList;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
@ -20,7 +17,9 @@ import de.anomic.tools.crypt;
import de.anomic.tools.bitfield; import de.anomic.tools.bitfield;
import de.anomic.tools.nxTools; import de.anomic.tools.nxTools;
public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { public class indexURLEntryNew implements indexURLEntry {
// this object stores attributes for URL entries
public static final kelondroRow rowdef = new kelondroRow( public static final kelondroRow rowdef = new kelondroRow(
"String hash-12, " + // the url's hash "String hash-12, " + // the url's hash
@ -44,9 +43,9 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
private kelondroRow.Entry entry; private kelondroRow.Entry entry;
private String snippet; private String snippet;
private indexEntry word; // this is only used if the url is transported via remote search requests private indexRWIEntry word; // this is only used if the url is transported via remote search requests
public plasmaCrawlLURLNewEntry( public indexURLEntryNew(
URL url, URL url,
String descr, String descr,
String author, String author,
@ -106,13 +105,13 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
return s.toString().getBytes(); return s.toString().getBytes();
} }
public plasmaCrawlLURLNewEntry(kelondroRow.Entry entry, indexEntry searchedWord) { public indexURLEntryNew(kelondroRow.Entry entry, indexRWIEntry searchedWord) {
this.entry = entry; this.entry = entry;
this.snippet = null; this.snippet = null;
this.word = searchedWord; this.word = searchedWord;
} }
public plasmaCrawlLURLNewEntry(Properties prop){ public indexURLEntryNew(Properties prop){
// generates an plasmaLURLEntry using the properties from the argument // generates an plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString // the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
@ -159,12 +158,12 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
this.entry.setCol("lvideo", Integer.parseInt(prop.getProperty("lvideo", "0"))); this.entry.setCol("lvideo", Integer.parseInt(prop.getProperty("lvideo", "0")));
this.entry.setCol("lapp", Integer.parseInt(prop.getProperty("lapp", "0"))); this.entry.setCol("lapp", Integer.parseInt(prop.getProperty("lapp", "0")));
this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null); this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null);
this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null; this.word = (prop.containsKey("word")) ? new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null;
} }
private StringBuffer corePropList() { private StringBuffer corePropList() {
// generate a parseable string; this is a simple property-list // generate a parseable string; this is a simple property-list
plasmaCrawlLURLEntry.Components comp = this.comp(); indexURLEntry.Components comp = this.comp();
final StringBuffer s = new StringBuffer(300); final StringBuffer s = new StringBuffer(300);
try { try {
s.append("hash=").append(hash()); s.append("hash=").append(hash());
@ -217,9 +216,9 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
return this.entry.getColString("hash", "", null); return this.entry.getColString("hash", "", null);
} }
public plasmaCrawlLURLEntry.Components comp() { public indexURLEntry.Components comp() {
ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8"); ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8");
return new de.anomic.plasma.plasmaCrawlLURLEntry.Components( return new indexURLEntry.Components(
(cl.size() > 0) ? (String) cl.get(0) : "", (cl.size() > 0) ? (String) cl.get(0) : "",
(cl.size() > 1) ? (String) cl.get(1) : "", (cl.size() > 1) ? (String) cl.get(1) : "",
(cl.size() > 2) ? (String) cl.get(2) : "", (cl.size() > 2) ? (String) cl.get(2) : "",
@ -299,11 +298,11 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
return snippet; return snippet;
} }
public indexEntry word() { public indexRWIEntry word() {
return word; return word;
} }
public boolean isOlder(plasmaCrawlLURLEntry other) { public boolean isOlder(indexURLEntry other) {
if (other == null) return false; if (other == null) return false;
Date tmoddate = moddate(); Date tmoddate = moddate();
Date omoddate = other.moddate(); Date omoddate = other.moddate();

@ -24,39 +24,37 @@
// along with this program; if not, write to the Free Software // along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma; package de.anomic.index;
import java.io.IOException; import java.io.IOException;
import java.util.Date; import java.util.Date;
import java.util.Properties; import java.util.Properties;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield; import de.anomic.tools.bitfield;
import de.anomic.tools.crypt; import de.anomic.tools.crypt;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { public class indexURLEntryOld implements indexURLEntry {
public static final kelondroRow rowdef = new kelondroRow( public static final kelondroRow rowdef = new kelondroRow(
"String urlhash-" + indexURL.urlHashLength + ", " + // the url's hash "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String urlstring-" + indexURL.urlStringLength + ", " + // the url as string "String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string
"String urldescr-" + indexURL.urlDescrLength + ", " + // the description of the url "String urldescr-" + indexRWIEntryOld.urlDescrLength + ", " + // the description of the url
"Cardinal moddate-" + indexURL.urlDateLength + " {b64e}, " + // last-modified from the httpd "Cardinal moddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // last-modified from the httpd
"Cardinal loaddate-" + indexURL.urlDateLength + " {b64e}, " + // time when the url was loaded "Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // time when the url was loaded
"String refhash-" + indexURL.urlHashLength + ", " + // the url's referrer hash "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"Cardinal copycount-" + indexURL.urlCopyCountLength + " {b64e}, " + // "Cardinal copycount-" + indexRWIEntryOld.urlCopyCountLength + " {b64e}, " + //
"byte[] flags-" + indexURL.urlFlagLength + ", " + // flags "byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags
"Cardinal quality-" + indexURL.urlQualityLength + " {b64e}, " + // "Cardinal quality-" + indexRWIEntryOld.urlQualityLength + " {b64e}, " + //
"String language-" + indexURL.urlLanguageLength + ", " + // "String language-" + indexRWIEntryOld.urlLanguageLength + ", " + //
"byte[] doctype-" + indexURL.urlDoctypeLength + ", " + // "byte[] doctype-" + indexRWIEntryOld.urlDoctypeLength + ", " + //
"Cardinal size-" + indexURL.urlSizeLength + " {b64e}, " + // size of file in bytes "Cardinal size-" + indexRWIEntryOld.urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + indexURL.urlWordCountLength + " {b64e}"); // word count "Cardinal wc-" + indexRWIEntryOld.urlWordCountLength + " {b64e}"); // word count
private URL url; private URL url;
private String descr; private String descr;
@ -72,9 +70,9 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
private int size; private int size;
private int wordCount; private int wordCount;
private String snippet; private String snippet;
private indexEntry word; // this is only used if the url is transported via remote search requests private indexRWIEntry word; // this is only used if the url is transported via remote search requests
public plasmaCrawlLURLOldEntry( public indexURLEntryOld(
URL url, URL url,
String descr, String descr,
String author, String author,
@ -114,7 +112,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
this.word = null; this.word = null;
} }
public plasmaCrawlLURLOldEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException { public indexURLEntryOld(kelondroRow.Entry entry, indexRWIEntry searchedWord) throws IOException {
try { try {
this.urlHash = entry.getColString(0, null); this.urlHash = entry.getColString(0, null);
this.url = new URL(entry.getColString(1, "UTF-8")); this.url = new URL(entry.getColString(1, "UTF-8"));
@ -138,7 +136,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
} }
} }
public plasmaCrawlLURLOldEntry(Properties prop) { public indexURLEntryOld(Properties prop) {
// generates an plasmaLURLEntry using the properties from the argument // generates an plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString // the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
@ -161,7 +159,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
this.snippet = prop.getProperty("snippet", ""); this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; if (snippet.length() == 0) snippet = null;
else snippet = crypt.simpleDecode(snippet, null); else snippet = crypt.simpleDecode(snippet, null);
this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null; this.word = (prop.containsKey("word")) ? new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null;
} catch (Exception e) { } catch (Exception e) {
serverLog.logSevere("PLASMA", serverLog.logSevere("PLASMA",
"INTERNAL ERROR in plasmaLURL.entry/2:" "INTERNAL ERROR in plasmaLURL.entry/2:"
@ -178,8 +176,8 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
} }
public kelondroRow.Entry toRowEntry() throws IOException { public kelondroRow.Entry toRowEntry() throws IOException {
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexURL.urlDateLength); final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexURL.urlDateLength); final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
final byte[][] entry = new byte[][] { final byte[][] entry = new byte[][] {
urlHash.getBytes(), urlHash.getBytes(),
@ -188,13 +186,13 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
moddatestr.getBytes(), moddatestr.getBytes(),
loaddatestr.getBytes(), loaddatestr.getBytes(),
referrerHash.getBytes(), referrerHash.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(copyCount, indexURL.urlCopyCountLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(copyCount, indexRWIEntryOld.urlCopyCountLength).getBytes(),
flags.getBytes(), flags.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(quality, indexURL.urlQualityLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(quality, indexRWIEntryOld.urlQualityLength).getBytes(),
language.getBytes(), language.getBytes(),
new byte[] { (byte) doctype }, new byte[] { (byte) doctype },
kelondroBase64Order.enhancedCoder.encodeLong(size, indexURL.urlSizeLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(size, indexRWIEntryOld.urlSizeLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, indexURL.urlWordCountLength).getBytes()}; kelondroBase64Order.enhancedCoder.encodeLong(wordCount, indexRWIEntryOld.urlWordCountLength).getBytes()};
return rowdef.newEntry(entry); return rowdef.newEntry(entry);
} }
@ -264,11 +262,11 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
return snippet; return snippet;
} }
public indexEntry word() { public indexRWIEntry word() {
return word; return word;
} }
public boolean isOlder(plasmaCrawlLURLEntry other) { public boolean isOlder(indexURLEntry other) {
if (other == null) return false; if (other == null) return false;
if (moddate.before(other.moddate())) return true; if (moddate.before(other.moddate())) return true;
if (moddate.equals(other.moddate())) { if (moddate.equals(other.moddate())) {
@ -292,7 +290,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
",local=").append(((local()) ? "true" : "false")) ",local=").append(((local()) ? "true" : "false"))
.append(",q=").append( .append(",q=").append(
kelondroBase64Order.enhancedCoder.encodeLong( kelondroBase64Order.enhancedCoder.encodeLong(
quality, indexURL.urlQualityLength)) quality, indexRWIEntryOld.urlQualityLength))
.append(",dt=").append(doctype).append(",lang=").append( .append(",dt=").append(doctype).append(",lang=").append(
language).append(",url=").append( language).append(",url=").append(
crypt.simpleEncode(url.toString())).append( crypt.simpleEncode(url.toString())).append(

@ -51,6 +51,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLoaderMessage; import de.anomic.plasma.plasmaCrawlLoaderMessage;
@ -297,7 +298,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
this.name, this.name,
(failreason==null)?"Unknown reason":failreason, (failreason==null)?"Unknown reason":failreason,
new bitfield(indexURL.urlFlagLength) new bitfield(indexRWIEntryOld.urlFlagLength)
); );
// store the entry // store the entry

@ -5,7 +5,7 @@ import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexRWIEntryOld;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexAssortment; import de.anomic.plasma.plasmaWordIndexAssortment;
@ -63,7 +63,7 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
// initializing the import assortment db // initializing the import assortment db
this.log.logInfo("Initializing source assortment file"); this.log.logInfo("Initializing source assortment file");
try { try {
this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, indexURLEntry.urlEntryRow, assortmentNr, this.cacheSize/1024, preloadTime, this.log); this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, indexRWIEntryOld.urlEntryRow, assortmentNr, this.cacheSize/1024, preloadTime, this.log);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
System.exit(-1); System.exit(-1);

@ -7,10 +7,10 @@ import java.util.Iterator;
import java.util.TreeSet; import java.util.TreeSet;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDate; import de.anomic.server.serverDate;
@ -134,13 +134,13 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// loop through the entities of the container and get the // loop through the entities of the container and get the
// urlhash // urlhash
Iterator importWordIdxEntries = newContainer.entries(); Iterator importWordIdxEntries = newContainer.entries();
indexEntry importWordIdxEntry; indexRWIEntry importWordIdxEntry;
while (importWordIdxEntries.hasNext()) { while (importWordIdxEntries.hasNext()) {
// testing if import process was aborted // testing if import process was aborted
if (isAborted()) break; if (isAborted()) break;
// getting next word index entry // getting next word index entry
importWordIdxEntry = (indexEntry) importWordIdxEntries.next(); importWordIdxEntry = (indexRWIEntry) importWordIdxEntries.next();
String urlHash = importWordIdxEntry.urlHash(); String urlHash = importWordIdxEntry.urlHash();
entityUrls.add(urlHash); entityUrls.add(urlHash);
} }
@ -162,7 +162,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// we need to import the url // we need to import the url
// getting the url entry // getting the url entry
plasmaCrawlLURLEntry urlEntry = this.importUrlDB.load(urlHash, null); indexURLEntry urlEntry = this.importUrlDB.load(urlHash, null);
if (urlEntry != null) { if (urlEntry != null) {
/* write it into the home url db */ /* write it into the home url db */

@ -48,10 +48,10 @@ import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack; import de.anomic.kelondro.kelondroStack;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlBalancer { public class plasmaCrawlBalancer {
@ -59,7 +59,7 @@ public class plasmaCrawlBalancer {
private HashMap domainStacks; private HashMap domainStacks;
public plasmaCrawlBalancer(File stackFile) { public plasmaCrawlBalancer(File stackFile) {
stack = kelondroStack.open(stackFile, new kelondroRow("byte[] urlhash-" + indexURL.urlHashLength)); stack = kelondroStack.open(stackFile, new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength));
domainStacks = new HashMap(); domainStacks = new HashMap();
} }

@ -54,12 +54,14 @@ import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroFlexTable; import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.tools.bitfield; import de.anomic.tools.bitfield;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlEURL extends indexURL { public class plasmaCrawlEURL extends indexURL {
@ -134,17 +136,17 @@ public class plasmaCrawlEURL extends indexURL {
public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) { public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) {
super(); super();
kelondroRow rowdef = new kelondroRow( kelondroRow rowdef = new kelondroRow(
"String urlhash-" + urlHashLength + ", " + // the url's hash "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String refhash-" + urlHashLength + ", " + // the url's referrer hash "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String initiator-" + urlHashLength + ", " + // the crawling initiator "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String executor-" + urlHashLength + ", " + // the crawling executor "String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor
"String urlstring-" + urlStringLength + ", " + // the url as string "String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string
"String urlname-" + urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a> "String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-" + urlDateLength + " {b64e}, " + // the time when the url was first time appeared "Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared
"Cardinal loaddate-" + urlDateLength + " {b64e}, " + // the time when the url was last time tried to load "Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was last time tried to load
"Cardinal retrycount-" + urlRetryLength + " {b64e}, " + // number of load retries "Cardinal retrycount-" + indexRWIEntryOld.urlRetryLength + " {b64e}, " + // number of load retries
"String failcause-" + urlErrorLength + ", " + // string describing load failure "String failcause-" + indexRWIEntryOld.urlErrorLength + ", " + // string describing load failure
"byte[] flags-" + urlFlagLength); // extra space "byte[] flags-" + indexRWIEntryOld.urlFlagLength); // extra space
if (newdb) { if (newdb) {
String newCacheName = "urlErr3.table"; String newCacheName = "urlErr3.table";
@ -164,9 +166,9 @@ public class plasmaCrawlEURL extends indexURL {
public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor, public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor,
String name, String failreason, bitfield flags) { String name, String failreason, bitfield flags) {
if ((referrer == null) || (referrer.length() < urlHashLength)) referrer = dummyHash; if ((referrer == null) || (referrer.length() < yacySeedDB.commonHashLength)) referrer = dummyHash;
if ((initiator == null) || (initiator.length() < urlHashLength)) initiator = dummyHash; if ((initiator == null) || (initiator.length() < yacySeedDB.commonHashLength)) initiator = dummyHash;
if ((executor == null) || (executor.length() < urlHashLength)) executor = dummyHash; if ((executor == null) || (executor.length() < yacySeedDB.commonHashLength)) executor = dummyHash;
if (failreason == null) failreason = "unknown"; if (failreason == null) failreason = "unknown";
return new Entry(url, referrer, initiator, executor, name, failreason, flags); return new Entry(url, referrer, initiator, executor, name, failreason, flags);
} }
@ -289,8 +291,8 @@ public class plasmaCrawlEURL extends indexURL {
// stores the values from the object variables into the database // stores the values from the object variables into the database
if (this.stored) return; if (this.stored) return;
if (this.hash == null) return; if (this.hash == null) return;
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength); String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength); String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
// store the hash in the hash cache // store the hash in the hash cache
try { try {
@ -304,7 +306,7 @@ public class plasmaCrawlEURL extends indexURL {
this.name.getBytes(), this.name.getBytes(),
initdatestr.getBytes(), initdatestr.getBytes(),
trydatestr.getBytes(), trydatestr.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, urlRetryLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, indexRWIEntryOld.urlRetryLength).getBytes(),
this.failreason.getBytes(), this.failreason.getBytes(),
this.flags.getBytes() this.flags.getBytes()
}; };

@ -55,17 +55,18 @@ package de.anomic.plasma;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.Date; import java.util.Date;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Locale;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.http.httpc.response; import de.anomic.http.httpc.response;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryNew;
import de.anomic.index.indexURLEntryOld;
import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroFlexSplitTable; import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
@ -74,12 +75,9 @@ import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield; import de.anomic.tools.bitfield;
import de.anomic.tools.nxTools; import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
public final class plasmaCrawlLURL extends indexURL { public final class plasmaCrawlLURL extends indexURL {
@ -101,11 +99,11 @@ public final class plasmaCrawlLURL extends indexURL {
try { try {
if (newdb) { if (newdb) {
urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, plasmaCrawlLURLNewEntry.rowdef, kelondroBase64Order.enhancedCoder); urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, indexURLEntryNew.rowdef, kelondroBase64Order.enhancedCoder);
} else { } else {
File oldLURLDB = new File(plasmaPath, "urlHash.db"); File oldLURLDB = new File(plasmaPath, "urlHash.db");
oldLURLDB.getParentFile().mkdirs(); oldLURLDB.getParentFile().mkdirs();
urlIndexFile = new kelondroCache(new kelondroTree(oldLURLDB, bufferkb / 2 * 0x400, preloadTime, plasmaCrawlLURLOldEntry.rowdef), bufferkb / 2 * 0x400, true, false); urlIndexFile = new kelondroCache(new kelondroTree(oldLURLDB, bufferkb / 2 * 0x400, preloadTime, indexURLEntryOld.rowdef), bufferkb / 2 * 0x400, true, false);
} }
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
@ -121,7 +119,7 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack = new LinkedList(); gcrawlResultStack = new LinkedList();
} }
public synchronized void stack(plasmaCrawlLURLEntry e, String initiatorHash, String executorHash, int stackType) { public synchronized void stack(indexURLEntry e, String initiatorHash, String executorHash, int stackType) {
if (e == null) { return; } if (e == null) { return; }
try { try {
if (initiatorHash == null) { initiatorHash = dummyHash; } if (initiatorHash == null) { initiatorHash = dummyHash; }
@ -159,7 +157,7 @@ public final class plasmaCrawlLURL extends indexURL {
return 0; return 0;
} }
public synchronized plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) { public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord) {
// generates an plasmaLURLEntry using the url hash // generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered // to speed up the access, the url-hashes are buffered
// in the hash cache. // in the hash cache.
@ -171,17 +169,17 @@ public final class plasmaCrawlLURL extends indexURL {
kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes()); kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null; if (entry == null) return null;
if (newdb) if (newdb)
return new plasmaCrawlLURLNewEntry(entry, searchedWord); return new indexURLEntryNew(entry, searchedWord);
else else
return new plasmaCrawlLURLOldEntry(entry, searchedWord); return new indexURLEntryOld(entry, searchedWord);
} catch (IOException e) { } catch (IOException e) {
return null; return null;
} }
} }
public synchronized void store(plasmaCrawlLURLEntry entry) throws IOException { public synchronized void store(indexURLEntry entry) throws IOException {
// Check if there is a more recent Entry already in the DB // Check if there is a more recent Entry already in the DB
plasmaCrawlLURLEntry oldEntry; indexURLEntry oldEntry;
try { try {
if (exists(entry.hash())) { if (exists(entry.hash())) {
oldEntry = load(entry.hash(), null); oldEntry = load(entry.hash(), null);
@ -202,18 +200,18 @@ public final class plasmaCrawlLURL extends indexURL {
urlIndexFile.put(entry.toRowEntry(), entry.loaddate()); urlIndexFile.put(entry.toRowEntry(), entry.loaddate());
} }
public synchronized plasmaCrawlLURLEntry newEntry(String propStr) { public synchronized indexURLEntry newEntry(String propStr) {
if (propStr.startsWith("{") && propStr.endsWith("}")) { if (propStr.startsWith("{") && propStr.endsWith("}")) {
if (newdb) if (newdb)
return new plasmaCrawlLURLNewEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); return new indexURLEntryNew(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
else else
return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); return new indexURLEntryOld(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
} else { } else {
return null; return null;
} }
} }
public synchronized plasmaCrawlLURLEntry newEntry( public synchronized indexURLEntry newEntry(
URL url, URL url,
String descr, String descr,
String author, String author,
@ -236,10 +234,10 @@ public final class plasmaCrawlLURL extends indexURL {
int lvideo, int lvideo,
int lapp) { int lapp) {
if (newdb) if (newdb)
return new plasmaCrawlLURLNewEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5, return new indexURLEntryNew(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5,
size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp); size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp);
else else
return new plasmaCrawlLURLOldEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5, return new indexURLEntryOld(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5,
size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp); size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp);
} }
@ -257,36 +255,36 @@ public final class plasmaCrawlLURL extends indexURL {
public synchronized String getUrlHash(int stack, int pos) { public synchronized String getUrlHash(int stack, int pos) {
switch (stack) { switch (stack) {
case 1: return ((String) externResultStack.get(pos)).substring(0, urlHashLength); case 1: return ((String) externResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
case 2: return ((String) searchResultStack.get(pos)).substring(0, urlHashLength); case 2: return ((String) searchResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
case 3: return ((String) transfResultStack.get(pos)).substring(0, urlHashLength); case 3: return ((String) transfResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
case 4: return ((String) proxyResultStack.get(pos)).substring(0, urlHashLength); case 4: return ((String) proxyResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
case 5: return ((String) lcrawlResultStack.get(pos)).substring(0, urlHashLength); case 5: return ((String) lcrawlResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
case 6: return ((String) gcrawlResultStack.get(pos)).substring(0, urlHashLength); case 6: return ((String) gcrawlResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength);
} }
return null; return null;
} }
public synchronized String getInitiatorHash(int stack, int pos) { public synchronized String getInitiatorHash(int stack, int pos) {
switch (stack) { switch (stack) {
case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 1: return ((String) externResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
case 2: return ((String) searchResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 2: return ((String) searchResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
case 3: return ((String) transfResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 3: return ((String) transfResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
case 4: return ((String) proxyResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 4: return ((String) proxyResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
case 5: return ((String) lcrawlResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 5: return ((String) lcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
case 6: return ((String) gcrawlResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); case 6: return ((String) gcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2);
} }
return null; return null;
} }
public synchronized String getExecutorHash(int stack, int pos) { public synchronized String getExecutorHash(int stack, int pos) {
switch (stack) { switch (stack) {
case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 1: return ((String) externResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
case 2: return ((String) searchResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 2: return ((String) searchResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
case 3: return ((String) transfResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 3: return ((String) transfResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
case 4: return ((String) proxyResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 4: return ((String) proxyResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
case 5: return ((String) lcrawlResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 5: return ((String) lcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
case 6: return ((String) gcrawlResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); case 6: return ((String) gcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3);
} }
return null; return null;
} }
@ -342,87 +340,9 @@ public final class plasmaCrawlLURL extends indexURL {
} }
} }
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException {
private static String daydate(Date date) { // enumerates entry elements
if (date == null) { return new kiter(up, rotating, firstHash);
return "";
} else {
return dayFormatter.format(date);
}
}
public serverObjects genTableProps(int tabletype, int lines, boolean showInit, boolean showExec, String dfltInit, String dfltExec, String feedbackpage, boolean makeLink) {
/* serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps tabletype=" + tabletype + " lines=" + lines +
" showInit=" + showInit + " showExec=" + showExec +
" dfltInit=" + dfltInit + " dfltExec=" + dfltExec +
" feedbackpage=" + feedbackpage + " makeLink=" + makeLink); */
final serverObjects prop = new serverObjects();
if (getStackSize(tabletype) == 0) {
prop.put("table", 0);
return prop;
}
prop.put("table", 1);
if (lines > getStackSize(tabletype)) lines = getStackSize(tabletype);
if (lines == getStackSize(tabletype)) {
prop.put("table_size", 0);
} else {
prop.put("table_size", 1);
prop.put("table_size_count", lines);
}
prop.put("table_size_all", getStackSize(tabletype));
prop.put("table_feedbackpage", feedbackpage);
prop.put("table_tabletype", tabletype);
prop.put("table_showInit", (showInit) ? 1 : 0);
prop.put("table_showExec", (showExec) ? 1 : 0);
boolean dark = true;
String urlHash, initiatorHash, executorHash;
String cachepath, urlstr, urltxt;
yacySeed initiatorSeed, executorSeed;
plasmaCrawlLURLEntry urle;
// needed for getCachePath(url)
final plasmaSwitchboard switchboard = plasmaSwitchboard.getSwitchboard();
final plasmaHTCache cacheManager = switchboard.getCacheManager();
int i, cnt = 0;
for (i = getStackSize(tabletype) - 1; i >= (getStackSize(tabletype) - lines); i--) {
initiatorHash = getInitiatorHash(tabletype, i);
executorHash = getExecutorHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
urlHash = getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = load(urlHash, null);
plasmaCrawlLURLEntry.Components comp = urle.comp();
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash);
urlstr = comp.url().toNormalform();
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1);
prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_feedbackpage", feedbackpage);
prop.put("table_indexed_" + cnt + "_tabletype", tabletype);
prop.put("table_indexed_" + cnt + "_urlhash", urlHash);
prop.put("table_indexed_" + cnt + "_showInit", (showInit) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_showInit_initiatorSeed", (initiatorSeed == null) ? dfltInit : initiatorSeed.getName());
prop.put("table_indexed_" + cnt + "_showExec", (showExec) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? dfltExec : executorSeed.getName());
prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate()));
prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount());
prop.put("table_indexed_" + cnt + "_urldescr", comp.descr());
prop.put("table_indexed_" + cnt + "_url", (cachepath == null) ? "-not-cached-" : ((makeLink) ? ("<a href=\"CacheAdmin_p.html?action=info&path=" + cachepath + "\" class=\"small\" title=\"" + urlstr + "\">" + urltxt + "</a>") : urlstr));
dark = !dark;
cnt++;
} catch (Exception e) {
serverLog.logSevere("PLASMA", "genTableProps", e);
}
}
prop.put("table_indexed", cnt);
return prop;
} }
public class kiter implements Iterator { public class kiter implements Iterator {
@ -445,9 +365,9 @@ public final class plasmaCrawlLURL extends indexURL {
if (e == null) return null; if (e == null) return null;
try { try {
if (newdb) if (newdb)
return new plasmaCrawlLURLNewEntry(e, null); return new indexURLEntryNew(e, null);
else else
return new plasmaCrawlLURLOldEntry(e, null); return new indexURLEntryOld(e, null);
} catch (IOException ex) { } catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null)); throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
} }
@ -459,11 +379,6 @@ public final class plasmaCrawlLURL extends indexURL {
} }
public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException {
// enumerates entry elements
return new kiter(up, rotating, firstHash);
}
/** /**
* Uses an Iteration over urlHash.db to detect malformed URL-Entries. * Uses an Iteration over urlHash.db to detect malformed URL-Entries.
* Damaged URL-Entries will be marked in a HashSet and removed at the end of the function. * Damaged URL-Entries will be marked in a HashSet and removed at the end of the function.
@ -578,8 +493,8 @@ public final class plasmaCrawlLURL extends indexURL {
} }
} }
plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next(); indexURLEntry entry = (indexURLEntry) eiter.next();
plasmaCrawlLURLEntry.Components comp = entry.comp(); indexURLEntry.Components comp = entry.comp();
totalSearchedUrls++; totalSearchedUrls++;
if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, comp.url()) || if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, comp.url()) ||
plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, comp.url())) { plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, comp.url())) {
@ -650,7 +565,7 @@ public final class plasmaCrawlLURL extends indexURL {
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0, false); final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0, false);
final Iterator enu = urls.entries(true, false, null); final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) { while (enu.hasNext()) {
System.out.println(((plasmaCrawlLURLEntry) enu.next()).toString()); System.out.println(((indexURLEntry) enu.next()).toString());
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();

@ -51,6 +51,7 @@ import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
@ -62,6 +63,7 @@ import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield; import de.anomic.tools.bitfield;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlNURL extends indexURL { public class plasmaCrawlNURL extends indexURL {
@ -78,18 +80,18 @@ public class plasmaCrawlNURL extends indexURL {
* column length definition for the {@link plasmaURL#urlIndexFile} DB * column length definition for the {@link plasmaURL#urlIndexFile} DB
*/ */
public final static kelondroRow rowdef = new kelondroRow( public final static kelondroRow rowdef = new kelondroRow(
"String urlhash-" + urlHashLength + ", " + // the url's hash "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String initiator-" + urlHashLength + ", " + // the crawling initiator "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String urlstring-" + urlStringLength + ", " + // the url as string "String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string
"String refhash-" + urlHashLength + ", " + // the url's referrer hash "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String urlname-" + urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a> "String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-" + urlDateLength + " {b64e}, " + // the time when the url was first time appeared "Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared
"String profile-" + urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle "String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"Cardinal depth-" + urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0 "Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-" + urlParentBranchesLength + " {b64e}, " + // number of anchors of the parent "Cardinal parentbr-" + indexRWIEntryOld.urlParentBranchesLength + " {b64e}, " + // number of anchors of the parent
"Cardinal forkfactor-" + urlForkFactorLength + " {b64e}, " + // sum of anchors of all ancestors "Cardinal forkfactor-" + indexRWIEntryOld.urlForkFactorLength + " {b64e}, " + // sum of anchors of all ancestors
"byte[] flags-" + urlFlagLength + ", " + // flags "byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags
"String handle-" + urlHandleLength); // extra handle "String handle-" + indexRWIEntryOld.urlHandleLength); // extra handle
private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1 private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth
@ -128,7 +130,7 @@ public class plasmaCrawlNURL extends indexURL {
limitStack = new plasmaCrawlBalancer(limitStackFile); limitStack = new plasmaCrawlBalancer(limitStackFile);
overhangStack = new plasmaCrawlBalancer(overhangStackFile); overhangStack = new plasmaCrawlBalancer(overhangStackFile);
remoteStack = new plasmaCrawlBalancer(remoteStackFile); remoteStack = new plasmaCrawlBalancer(remoteStackFile);
kelondroRow rowdef = new kelondroRow("byte[] urlhash-" + indexURL.urlHashLength); kelondroRow rowdef = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength);
imageStack = kelondroStack.open(imageStackFile, rowdef); imageStack = kelondroStack.open(imageStackFile, rowdef);
movieStack = kelondroStack.open(movieStackFile, rowdef); movieStack = kelondroStack.open(movieStackFile, rowdef);
musicStack = kelondroStack.open(musicStackFile, rowdef); musicStack = kelondroStack.open(musicStackFile, rowdef);
@ -257,7 +259,7 @@ public class plasmaCrawlNURL extends indexURL {
private static String normalizeHandle(int h) { private static String normalizeHandle(int h) {
String d = Integer.toHexString(h); String d = Integer.toHexString(h);
while (d.length() < urlHandleLength) d = "0" + d; while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d;
return d; return d;
} }
@ -479,7 +481,7 @@ public class plasmaCrawlNURL extends indexURL {
this.depth = depth; this.depth = depth;
this.anchors = anchors; this.anchors = anchors;
this.forkfactor = forkfactor; this.forkfactor = forkfactor;
this.flags = new bitfield(urlFlagLength); this.flags = new bitfield(indexRWIEntryOld.urlFlagLength);
this.handle = 0; this.handle = 0;
this.stored = false; this.stored = false;
} }
@ -533,7 +535,7 @@ public class plasmaCrawlNURL extends indexURL {
public void store() { public void store() {
// stores the values from the object variables into the database // stores the values from the object variables into the database
if (this.stored) return; if (this.stored) return;
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength); String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
// store the hash in the hash cache // store the hash in the hash cache
try { try {
// even if the entry exists, we simply overwrite it // even if the entry exists, we simply overwrite it
@ -545,9 +547,9 @@ public class plasmaCrawlNURL extends indexURL {
this.name.getBytes("UTF-8"), this.name.getBytes("UTF-8"),
loaddatestr.getBytes(), loaddatestr.getBytes(),
(this.profileHandle == null) ? null : this.profileHandle.getBytes(), (this.profileHandle == null) ? null : this.profileHandle.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, urlCrawlDepthLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, urlParentBranchesLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, urlForkFactorLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(),
this.flags.getBytes(), this.flags.getBytes(),
normalizeHandle(this.handle).getBytes() normalizeHandle(this.handle).getBytes()
}; };

@ -48,7 +48,7 @@ import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import de.anomic.index.indexURL; import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
@ -68,7 +68,7 @@ public class plasmaCrawlProfile {
this.bufferkb = bufferkb; this.bufferkb = bufferkb;
this.preloadTime = preloadTime; this.preloadTime = preloadTime;
profileTableFile.getParentFile().mkdirs(); profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexURL.urlCrawlProfileHandleLength, 2000, '#'); kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#');
profileTable = new kelondroMap(dyn); profileTable = new kelondroMap(dyn);
domsCache = new HashMap(); domsCache = new HashMap();
} }
@ -94,7 +94,7 @@ public class plasmaCrawlProfile {
if (profileTable != null) try { profileTable.close(); } catch (IOException e) {} if (profileTable != null) try { profileTable.close(); } catch (IOException e) {}
if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database"); if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database");
profileTableFile.getParentFile().mkdirs(); profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexURL.urlCrawlProfileHandleLength, 2000, '#'); kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#');
profileTable = new kelondroMap(dyn); profileTable = new kelondroMap(dyn);
} }
@ -256,7 +256,7 @@ public class plasmaCrawlProfile {
boolean storeHTCache, boolean storeTXCache, boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing, boolean localIndexing, boolean remoteIndexing,
boolean xsstopw, boolean xdstopw, boolean xpstopw) { boolean xsstopw, boolean xdstopw, boolean xpstopw) {
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, indexURL.urlCrawlProfileHandleLength); String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, indexRWIEntryOld.urlCrawlProfileHandleLength);
mem = new HashMap(); mem = new HashMap();
mem.put("handle", handle); mem.put("handle", handle);
mem.put("name", name); mem.put("name", name);

@ -60,6 +60,8 @@ import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.data.robotsParser; import de.anomic.data.robotsParser;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
@ -391,7 +393,7 @@ public final class plasmaCrawlStacker {
checkInterruption(); checkInterruption();
String nexturlhash = indexURL.urlHash(nexturl); String nexturlhash = indexURL.urlHash(nexturl);
String dbocc = this.sb.urlPool.exists(nexturlhash); String dbocc = this.sb.urlPool.exists(nexturlhash);
plasmaCrawlLURLEntry oldEntry = null; indexURLEntry oldEntry = null;
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null); oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder()); boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) { if ((dbocc != null) && (!(recrawl))) {
@ -490,7 +492,7 @@ public final class plasmaCrawlStacker {
this.depth = depth; this.depth = depth;
this.anchors = anchors; this.anchors = anchors;
this.forkfactor = forkfactor; this.forkfactor = forkfactor;
this.flags = new bitfield(indexURL.urlFlagLength); this.flags = new bitfield(indexRWIEntryOld.urlFlagLength);
this.handle = 0; this.handle = 0;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
@ -573,7 +575,7 @@ public final class plasmaCrawlStacker {
public byte[][] getBytes() { public byte[][] getBytes() {
// stores the values from the object variables into the database // stores the values from the object variables into the database
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexURL.urlDateLength); String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
// store the hash in the hash cache // store the hash in the hash cache
// even if the entry exists, we simply overwrite it // even if the entry exists, we simply overwrite it
@ -587,9 +589,9 @@ public final class plasmaCrawlStacker {
this.name.getBytes("UTF-8"), this.name.getBytes("UTF-8"),
loaddatestr.getBytes(), loaddatestr.getBytes(),
(this.profileHandle == null) ? null : this.profileHandle.getBytes(), (this.profileHandle == null) ? null : this.profileHandle.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexURL.urlCrawlDepthLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexURL.urlParentBranchesLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexURL.urlForkFactorLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(),
this.flags.getBytes(), this.flags.getBytes(),
normalizeHandle(this.handle).getBytes() normalizeHandle(this.handle).getBytes()
}; };
@ -599,7 +601,7 @@ public final class plasmaCrawlStacker {
private String normalizeHandle(int h) { private String normalizeHandle(int h) {
String d = Integer.toHexString(h); String d = Integer.toHexString(h);
while (d.length() < indexURL.urlHandleLength) d = "0" + d; while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d;
return d; return d;
} }
} }
@ -1057,7 +1059,7 @@ public final class plasmaCrawlStacker {
yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
this.theMsg.name, this.theMsg.name,
rejectReason, rejectReason,
new bitfield(indexURL.urlFlagLength) new bitfield(indexRWIEntryOld.urlFlagLength)
); );
ee.store(); ee.store();
sb.urlPool.errorURL.stackPushEntry(ee); sb.urlPool.errorURL.stackPushEntry(ee);

@ -48,7 +48,8 @@ import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
@ -200,8 +201,8 @@ public class plasmaDHTChunk {
Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, resourceLevel, true, maxcount).iterator(); Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, resourceLevel, true, maxcount).iterator();
indexContainer container; indexContainer container;
Iterator urlIter; Iterator urlIter;
indexEntry iEntry; indexRWIEntry iEntry;
plasmaCrawlLURLEntry lurl; indexURLEntry lurl;
int refcount = 0; int refcount = 0;
int wholesize; int wholesize;
@ -227,7 +228,7 @@ public class plasmaDHTChunk {
urlIter = container.entries(); urlIter = container.entries();
// iterate over indexes to fetch url entries and store them in the urlCache // iterate over indexes to fetch url entries and store them in the urlCache
while ((urlIter.hasNext()) && (maxcount > refcount) && (System.currentTimeMillis() < timeout)) { while ((urlIter.hasNext()) && (maxcount > refcount) && (System.currentTimeMillis() < timeout)) {
iEntry = (indexEntry) urlIter.next(); iEntry = (indexRWIEntry) urlIter.next();
lurl = lurls.load(iEntry.urlHash(), iEntry); lurl = lurls.load(iEntry.urlHash(), iEntry);
if ((lurl == null) || (lurl.comp().url() == null)) { if ((lurl == null) || (lurl.comp().url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash()); //yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
@ -243,7 +244,7 @@ public class plasmaDHTChunk {
// remove all remaining; we have enough // remove all remaining; we have enough
while (urlIter.hasNext()) { while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next(); iEntry = (indexRWIEntry) urlIter.next();
urlIter.remove(); urlIter.remove();
} }
@ -285,7 +286,7 @@ public class plasmaDHTChunk {
public synchronized String deleteTransferIndexes() { public synchronized String deleteTransferIndexes() {
Iterator urlIter; Iterator urlIter;
indexEntry iEntry; indexRWIEntry iEntry;
HashSet urlHashes; HashSet urlHashes;
String count = "0"; String count = "0";
@ -299,7 +300,7 @@ public class plasmaDHTChunk {
urlHashes = new HashSet(this.indexContainers[i].size()); urlHashes = new HashSet(this.indexContainers[i].size());
urlIter = this.indexContainers[i].entries(); urlIter = this.indexContainers[i].entries();
while (urlIter.hasNext()) { while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next(); iEntry = (indexRWIEntry) urlIter.next();
urlHashes.add(iEntry.urlHash()); urlHashes.add(iEntry.urlHash());
} }
String wordHash = indexContainers[i].getWordHash(); String wordHash = indexContainers[i].getWordHash();

@ -90,6 +90,7 @@ import de.anomic.server.serverThread;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.tools.enumerateFiles; import de.anomic.tools.enumerateFiles;
import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;
public final class plasmaHTCache { public final class plasmaHTCache {
@ -173,7 +174,7 @@ public final class plasmaHTCache {
// open the response header database // open the response header database
File dbfile = new File(this.cachePath, "responseHeader.db"); File dbfile = new File(this.cachePath, "responseHeader.db");
try { try {
this.responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, preloadTime, indexURL.urlHashLength, 150, '#')); this.responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, preloadTime, yacySeedDB.commonHashLength, 150, '#'));
} catch (IOException e) { } catch (IOException e) {
this.log.logSevere("the request header database could not be opened: " + e.getMessage()); this.log.logSevere("the request header database could not be opened: " + e.getMessage());
System.exit(0); System.exit(0);
@ -717,7 +718,7 @@ public final class plasmaHTCache {
if (hexHash.indexOf('.') >= 0) return null; if (hexHash.indexOf('.') >= 0) return null;
try { try {
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.decodeHex(hexHash)); String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.decodeHex(hexHash));
if (hash.length() == indexURL.urlHashLength) return hash; if (hash.length() == yacySeedDB.commonHashLength) return hash;
return null; return null;
} catch (Exception e) { } catch (Exception e) {
//log.logWarning("getHash: " + e.getMessage(), e); //log.logWarning("getHash: " + e.getMessage(), e);

@ -51,7 +51,8 @@ import java.util.Set;
import java.util.TreeMap; import java.util.TreeMap;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
@ -379,8 +380,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
//if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty //if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty
//if (searchResult.size() == 0) return acc; // case that we have nothing to do //if (searchResult.size() == 0) return acc; // case that we have nothing to do
indexEntry entry; indexRWIEntry entry;
plasmaCrawlLURLEntry page; indexURLEntry page;
Long preranking; Long preranking;
Object[] preorderEntry; Object[] preorderEntry;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
@ -388,7 +389,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
while (preorder.hasNext()) { while (preorder.hasNext()) {
if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break; if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break;
preorderEntry = preorder.next(); preorderEntry = preorder.next();
entry = (indexEntry) preorderEntry[0]; entry = (indexRWIEntry) preorderEntry[0];
// load only urls if there was not yet a root url of that hash // load only urls if there was not yet a root url of that hash
preranking = (Long) preorderEntry[1]; preranking = (Long) preorderEntry[1];
// find the url entry // find the url entry
@ -425,11 +426,11 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
preorder.remove(true, true); preorder.remove(true, true);
// start url-fetch // start url-fetch
indexEntry entry; indexRWIEntry entry;
try { try {
while (preorder.hasNext()) { while (preorder.hasNext()) {
if (System.currentTimeMillis() >= timeout) break; if (System.currentTimeMillis() >= timeout) break;
entry = (indexEntry) (preorder.next()[0]); entry = (indexRWIEntry) (preorder.next()[0]);
// find and fetch the url entry // find and fetch the url entry
urlStore.load(entry.urlHash(), entry); urlStore.load(entry.urlHash(), entry);
} }

@ -48,6 +48,7 @@ import java.util.Map;
import java.util.TreeSet; import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverDate; import de.anomic.server.serverDate;
@ -101,7 +102,7 @@ public final class plasmaSearchImages {
public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, plasmaSearchResult sres, int depth) { public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, plasmaSearchResult sres, int depth) {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
this.images = new TreeSet(); this.images = new TreeSet();
plasmaCrawlLURLEntry urlentry; indexURLEntry urlentry;
while (sres.hasMoreElements()) { while (sres.hasMoreElements()) {
urlentry = sres.nextElement(); urlentry = sres.nextElement();
addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.comp().url(), depth)); addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.comp().url(), depth));

@ -50,7 +50,7 @@ import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBinSearch; import de.anomic.kelondro.kelondroBinSearch;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
@ -61,7 +61,7 @@ public final class plasmaSearchPreOrder {
public static kelondroBinSearch[] ybrTables = null; // block-rank tables public static kelondroBinSearch[] ybrTables = null; // block-rank tables
private static boolean useYBR = true; private static boolean useYBR = true;
private indexEntry entryMin, entryMax; private indexRWIEntry entryMin, entryMax;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private plasmaSearchQuery query; private plasmaSearchQuery query;
private plasmaSearchRankingProfile ranking; private plasmaSearchRankingProfile ranking;
@ -79,7 +79,7 @@ public final class plasmaSearchPreOrder {
this.ranking = ranking; this.ranking = ranking;
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
indexEntry iEntry; indexRWIEntry iEntry;
// first pass: find min/max to obtain limits for normalization // first pass: find min/max to obtain limits for normalization
Iterator i = container.entries(); Iterator i = container.entries();
@ -88,9 +88,9 @@ public final class plasmaSearchPreOrder {
this.entryMax = null; this.entryMax = null;
while (i.hasNext()) { while (i.hasNext()) {
if (System.currentTimeMillis() > limitTime) break; if (System.currentTimeMillis() > limitTime) break;
iEntry = (indexEntry) i.next(); iEntry = (indexRWIEntry) i.next();
if (this.entryMin == null) this.entryMin = (indexEntry) iEntry.clone(); else this.entryMin.min(iEntry); if (this.entryMin == null) this.entryMin = (indexRWIEntry) iEntry.clone(); else this.entryMin.min(iEntry);
if (this.entryMax == null) this.entryMax = (indexEntry) iEntry.clone(); else this.entryMax.max(iEntry); if (this.entryMax == null) this.entryMax = (indexRWIEntry) iEntry.clone(); else this.entryMax.max(iEntry);
count++; count++;
} }
@ -98,7 +98,7 @@ public final class plasmaSearchPreOrder {
i = container.entries(); i = container.entries();
this.pageAcc = new TreeMap(); this.pageAcc = new TreeMap();
for (int j = 0; j < count; j++) { for (int j = 0; j < count; j++) {
iEntry = (indexEntry) i.next(); iEntry = (indexRWIEntry) i.next();
pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry); pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry);
} }
} }
@ -110,13 +110,13 @@ public final class plasmaSearchPreOrder {
HashSet doubleDoms = new HashSet(); HashSet doubleDoms = new HashSet();
Iterator i = pageAcc.entrySet().iterator(); Iterator i = pageAcc.entrySet().iterator();
Map.Entry entry; Map.Entry entry;
indexEntry iEntry; indexRWIEntry iEntry;
String hashpart; String hashpart;
boolean isWordRootURL; boolean isWordRootURL;
while (i.hasNext()) { while (i.hasNext()) {
if (pageAcc.size() <= query.wantedResults) break; if (pageAcc.size() <= query.wantedResults) break;
entry = (Map.Entry) i.next(); entry = (Map.Entry) i.next();
iEntry = (indexEntry) entry.getValue(); iEntry = (indexRWIEntry) entry.getValue();
hashpart = iEntry.urlHash().substring(6); hashpart = iEntry.urlHash().substring(6);
isWordRootURL = indexURL.isWordRootURL(iEntry.urlHash(), query.words("")); isWordRootURL = indexURL.isWordRootURL(iEntry.urlHash(), query.words(""));
if ((!(isWordRootURL)) && if ((!(isWordRootURL)) &&
@ -192,11 +192,11 @@ public final class plasmaSearchPreOrder {
e.printStackTrace(); e.printStackTrace();
preranking = new Long(0); preranking = new Long(0);
} }
return new Object[]{(indexEntry) pageAcc.remove(top), preranking}; return new Object[]{(indexRWIEntry) pageAcc.remove(top), preranking};
} }
public indexEntry[] getNormalizer() { public indexRWIEntry[] getNormalizer() {
return new indexEntry[] {entryMin, entryMax}; return new indexRWIEntry[] {entryMin, entryMax};
} }
public static int ybr_p(String urlHash) { public static int ybr_p(String urlHash) {

@ -51,6 +51,7 @@ import de.anomic.htmlFilter.htmlFilterAbstractScraper;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.server.serverCharBuffer; import de.anomic.server.serverCharBuffer;
import de.anomic.yacy.yacySeedDB;
public final class plasmaSearchQuery { public final class plasmaSearchQuery {
@ -120,16 +121,16 @@ public final class plasmaSearchQuery {
public static Set hashes2Set(String query) { public static Set hashes2Set(String query) {
if (query == null) return new HashSet(); if (query == null) return new HashSet();
final HashSet keyhashes = new HashSet(query.length() / indexEntryAttribute.wordHashLength); final HashSet keyhashes = new HashSet(query.length() / yacySeedDB.commonHashLength);
for (int i = 0; i < (query.length() / indexEntryAttribute.wordHashLength); i++) { for (int i = 0; i < (query.length() / yacySeedDB.commonHashLength); i++) {
keyhashes.add(query.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength)); keyhashes.add(query.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength));
} }
return keyhashes; return keyhashes;
} }
public static String hashSet2hashString(Set words) { public static String hashSet2hashString(Set words) {
Iterator i = words.iterator(); Iterator i = words.iterator();
StringBuffer sb = new StringBuffer(words.size() * indexEntryAttribute.wordHashLength); StringBuffer sb = new StringBuffer(words.size() * yacySeedDB.commonHashLength);
while (i.hasNext()) sb.append((String) i.next()); while (i.hasNext()) sb.append((String) i.next());
return new String(sb); return new String(sb);
} }

@ -46,8 +46,9 @@ import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
public class plasmaSearchRankingProfile { public class plasmaSearchRankingProfile {
@ -164,7 +165,7 @@ public class plasmaSearchRankingProfile {
return new String(ext); return new String(ext);
} }
public long preRanking(indexEntry normalizedEntry, String searchedWord) { public long preRanking(indexRWIEntry normalizedEntry, String searchedWord) {
// the normalizedEntry must be a normalized indexEntry // the normalizedEntry must be a normalized indexEntry
long ranking = 0; long ranking = 0;
ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue(); ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue();
@ -191,13 +192,13 @@ public class plasmaSearchRankingProfile {
Set topwords, Set topwords,
String[] urlcomps, String[] urlcomps,
String[] descrcomps, String[] descrcomps,
plasmaCrawlLURLEntry page) { indexURLEntry page) {
// apply pre-calculated order attributes // apply pre-calculated order attributes
long ranking = preranking; long ranking = preranking;
// prefer hit with 'prefer' pattern // prefer hit with 'prefer' pattern
plasmaCrawlLURLEntry.Components comp = page.comp(); indexURLEntry.Components comp = page.comp();
if (comp.url().toNormalform().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); if (comp.url().toNormalform().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
if (comp.descr().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); if (comp.descr().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();

@ -54,6 +54,7 @@ import java.util.TreeMap;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
@ -99,16 +100,16 @@ public final class plasmaSearchResult {
return pageAcc.size() > 0; return pageAcc.size() > 0;
} }
public plasmaCrawlLURLEntry nextElement() { public indexURLEntry nextElement() {
Object top = pageAcc.firstKey(); Object top = pageAcc.firstKey();
//System.out.println("postorder-key: " + ((String) top)); //System.out.println("postorder-key: " + ((String) top));
return (plasmaCrawlLURLEntry) pageAcc.remove(top); return (indexURLEntry) pageAcc.remove(top);
} }
protected void addResult(plasmaCrawlLURLEntry page, Long preranking) { protected void addResult(indexURLEntry page, Long preranking) {
// take out relevant information for reference computation // take out relevant information for reference computation
plasmaCrawlLURLEntry.Components comp = page.comp(); indexURLEntry.Components comp = page.comp();
if ((comp.url() == null) || (comp.descr() == null)) return; if ((comp.url() == null) || (comp.descr() == null)) return;
String[] urlcomps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()); // word components of the url String[] urlcomps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()); // word components of the url
String[] descrcomps = comp.descr().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description String[] descrcomps = comp.descr().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
@ -131,12 +132,12 @@ public final class plasmaSearchResult {
for (int i = 0; i < references.length; i++) commonSense.add(references[i]); for (int i = 0; i < references.length; i++) commonSense.add(references[i]);
Object[] resultVector; Object[] resultVector;
plasmaCrawlLURLEntry page; indexURLEntry page;
long ranking; long ranking;
for (int i = 0; i < results.size(); i++) { for (int i = 0; i < results.size(); i++) {
// take out values from result array // take out values from result array
resultVector = (Object[]) results.get(i); resultVector = (Object[]) results.get(i);
page = (plasmaCrawlLURLEntry) resultVector[0]; page = (indexURLEntry) resultVector[0];
// calculate ranking // calculate ranking
if (postsort) if (postsort)
@ -172,7 +173,7 @@ public final class plasmaSearchResult {
// first scan all entries and find all urls that are referenced // first scan all entries and find all urls that are referenced
while (i.hasNext()) { while (i.hasNext()) {
entry = (Map.Entry) i.next(); entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url()); path = urlPath(((indexURLEntry) entry.getValue()).comp().url());
paths.put(path, entry.getKey()); paths.put(path, entry.getKey());
//if (path != null) path = shortenPath(path); //if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey()); //if (path != null) paths.put(path, entry.getKey());
@ -183,7 +184,7 @@ public final class plasmaSearchResult {
String shorten; String shorten;
while (i.hasNext()) { while (i.hasNext()) {
entry = (Map.Entry) i.next(); entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url()); path = urlPath(((indexURLEntry) entry.getValue()).comp().url());
shorten = shortenPath(path); shorten = shortenPath(path);
// scan all subpaths of the url // scan all subpaths of the url
while (shorten != null) { while (shorten != null) {

@ -58,6 +58,7 @@ import de.anomic.http.httpHeader;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo; import de.anomic.plasma.cache.IResourceInfo;
@ -630,12 +631,12 @@ public class plasmaSnippetCache {
public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) { public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) {
// fetch snippets // fetch snippets
int i = 0; int i = 0;
plasmaCrawlLURLEntry urlentry; indexURLEntry urlentry;
String urlstring; String urlstring;
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) { while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {
urlentry = acc.nextElement(); urlentry = acc.nextElement();
plasmaCrawlLURLEntry.Components comp = urlentry.comp(); indexURLEntry.Components comp = urlentry.comp();
if (comp.url().getHost().endsWith(".yacyh")) continue; if (comp.url().getHost().endsWith(".yacyh")) continue;
urlstring = comp.url().toNormalform(); urlstring = comp.url().toNormalform();
if ((urlstring.matches(urlmask)) && if ((urlstring.matches(urlmask)) &&

@ -132,9 +132,10 @@ import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
@ -1429,14 +1430,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException, ParserException { private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException, ParserException {
plasmaParserDocument document = null;
// the mimetype of this entry // the mimetype of this entry
String mimeType = entry.getMimeType(); String mimeType = entry.getMimeType();
String charset = entry.getCharacterEncoding(); String charset = entry.getCharacterEncoding();
// the parser logger // the parser logger
serverLog parserLogger = parser.getLogger(); //serverLog parserLogger = parser.getLogger();
// parse the document // parse the document
return parseResource(entry.url(), mimeType, charset, entry.cacheFile()); return parseResource(entry.url(), mimeType, charset, entry.cacheFile());
@ -1497,7 +1497,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (document == null) return; if (document == null) return;
} catch (ParserException e) { } catch (ParserException e) {
this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage()); this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexURL.urlFlagLength)); addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexRWIEntryOld.urlFlagLength));
if (document != null) { if (document != null) {
document.close(); document.close();
document = null; document = null;
@ -1574,7 +1574,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption(); checkInterruption();
// create a new loaded URL db entry // create a new loaded URL db entry
plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry( indexURLEntry newEntry = urlPool.loadedURL.newEntry(
entry.url(), // URL entry.url(), // URL
docDescription, // document description docDescription, // document description
"", // author "", // author
@ -1660,7 +1660,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String language = indexEntryAttribute.language(entry.url()); String language = indexEntryAttribute.language(entry.url());
char doctype = indexEntryAttribute.docType(document.getMimeType()); char doctype = indexEntryAttribute.docType(document.getMimeType());
plasmaCrawlLURLEntry.Components comp = newEntry.comp(); indexURLEntry.Components comp = newEntry.comp();
int urlLength = comp.url().toNormalform().length(); int urlLength = comp.url().toNormalform().length();
int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length; int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
@ -1673,7 +1673,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String word = (String) wentry.getKey(); String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue(); wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = indexEntryAttribute.word2hash(word); String wordHash = indexEntryAttribute.word2hash(word);
indexEntry wordIdxEntry = new indexURLEntry( indexRWIEntry wordIdxEntry = new indexRWIEntryOld(
urlHash, urlHash,
urlLength, urlComps, urlLength, urlComps,
wordStat.count, wordStat.count,
@ -1764,7 +1764,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
} else { } else {
log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase); log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase);
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield(indexURL.urlFlagLength)); addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield(indexRWIEntryOld.urlFlagLength));
} }
} catch (Exception ee) { } catch (Exception ee) {
if (ee instanceof InterruptedException) throw (InterruptedException)ee; if (ee instanceof InterruptedException) throw (InterruptedException)ee;
@ -1776,7 +1776,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, ""); yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, "");
} }
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield(indexURL.urlFlagLength)); addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield(indexRWIEntryOld.urlFlagLength));
} }
} else { } else {
@ -1784,7 +1784,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption(); checkInterruption();
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason); log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield(indexURL.urlFlagLength)); addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield(indexRWIEntryOld.urlFlagLength));
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, ""); yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
} }
@ -1991,7 +1991,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl"); String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) { if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURLEntry entry = urlPool.loadedURL.newEntry(propStr); indexURLEntry entry = urlPool.loadedURL.newEntry(propStr);
urlPool.loadedURL.store(entry); urlPool.loadedURL.store(entry);
urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
urlPool.noticeURL.remove(entry.hash()); urlPool.noticeURL.remove(entry.hash());
@ -2070,7 +2070,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("type_globalresults", acc.globalContributions); prop.put("type_globalresults", acc.globalContributions);
int i = 0; int i = 0;
int p; int p;
plasmaCrawlLURLEntry urlentry; indexURLEntry urlentry;
String urlstring, urlname, filename, urlhash; String urlstring, urlname, filename, urlhash;
String host, hash, address; String host, hash, address;
yacySeed seed; yacySeed seed;
@ -2081,7 +2081,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000; if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000;
while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) { while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) {
urlentry = acc.nextElement(); urlentry = acc.nextElement();
plasmaCrawlLURLEntry.Components comp = urlentry.comp(); indexURLEntry.Components comp = urlentry.comp();
urlhash = urlentry.hash(); urlhash = urlentry.hash();
assert (urlhash != null); assert (urlhash != null);
assert (urlhash.length() == 12) : "urlhash = " + urlhash; assert (urlhash.length() == 12) : "urlhash = " + urlhash;
@ -2218,9 +2218,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry // finally, delete the url entry
// determine the url string // determine the url string
plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null); indexURLEntry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0; if (entry == null) return 0;
plasmaCrawlLURLEntry.Components comp = entry.comp(); indexURLEntry.Components comp = entry.comp();
if (comp.url() == null) return 0; if (comp.url() == null) return 0;
InputStream resourceContent = null; InputStream resourceContent = null;

@ -51,6 +51,8 @@ import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
@ -79,14 +81,14 @@ public class plasmaSwitchboardQueue {
private void initQueueStack() { private void initQueueStack() {
kelondroRow rowdef = new kelondroRow( kelondroRow rowdef = new kelondroRow(
"String url-" + indexURL.urlStringLength + ", " + // the url "String url-" + yacySeedDB.commonHashLength + ", " + // the url
"String refhash-" + indexURL.urlHashLength + ", " + // the url's referrer hash "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"Cardinal modifiedsince-11" + " {b64e}, " + // from ifModifiedSince "Cardinal modifiedsince-11" + " {b64e}, " + // from ifModifiedSince
"byte[] flags-1" + ", " + // flags "byte[] flags-1" + ", " + // flags
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"Cardinal depth-" + indexURL.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0 "Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0
"String profile-" + indexURL.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle "String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"String urldescr-" + indexURL.urlDescrLength); // "String urldescr-" + indexRWIEntryOld.urlDescrLength); //
sbQueueStack = kelondroStack.open(sbQueueStackPath, rowdef); sbQueueStack = kelondroStack.open(sbQueueStackPath, rowdef);
} }
@ -108,7 +110,7 @@ public class plasmaSwitchboardQueue {
kelondroBase64Order.enhancedCoder.encodeLong((entry.ifModifiedSince == null) ? 0 : entry.ifModifiedSince.getTime(), 11).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong((entry.ifModifiedSince == null) ? 0 : entry.ifModifiedSince.getTime(), 11).getBytes(),
new byte[]{entry.flags}, new byte[]{entry.flags},
(entry.initiator == null) ? indexURL.dummyHash.getBytes() : entry.initiator.getBytes(), (entry.initiator == null) ? indexURL.dummyHash.getBytes() : entry.initiator.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, indexURL.urlCrawlDepthLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(),
(entry.profileHandle == null) ? indexURL.dummyHash.getBytes() : entry.profileHandle.getBytes(), (entry.profileHandle == null) ? indexURL.dummyHash.getBytes() : entry.profileHandle.getBytes(),
(entry.anchorName == null) ? "-".getBytes("UTF-8") : entry.anchorName.getBytes("UTF-8") (entry.anchorName == null) ? "-".getBytes("UTF-8") : entry.anchorName.getBytes("UTF-8")
})); }));
@ -333,7 +335,7 @@ public class plasmaSwitchboardQueue {
public URL referrerURL() { public URL referrerURL() {
if (referrerURL == null) { if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null; if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null); indexURLEntry entry = lurls.load(referrerHash, null);
if (entry == null) referrerURL = null; else referrerURL = entry.comp().url(); if (entry == null) referrerURL = null; else referrerURL = entry.comp().url();
} }
return referrerURL; return referrerURL;

@ -48,6 +48,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL; import de.anomic.net.URL;
public class plasmaURLPool { public class plasmaURLPool {
@ -83,7 +84,7 @@ public class plasmaURLPool {
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url(); if (ne != null) return ne.url();
} catch (IOException e) {} } catch (IOException e) {}
plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null); indexURLEntry le = loadedURL.load(urlhash, null);
if (le != null) return le.comp().url(); if (le != null) return le.comp().url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url(); if (ee != null) return ee.url();

@ -40,10 +40,11 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexCollectionRI; import de.anomic.index.indexCollectionRI;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexContainerOrder; import de.anomic.index.indexContainerOrder;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexRAMRI; import de.anomic.index.indexRAMRI;
import de.anomic.index.indexRI; import de.anomic.index.indexRI;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
@ -60,7 +61,7 @@ public final class plasmaWordIndex implements indexRI {
private static final String indexAssortmentClusterPath = "ACLUSTER"; private static final String indexAssortmentClusterPath = "ACLUSTER";
private static final int assortmentCount = 64; private static final int assortmentCount = 64;
private static final kelondroRow payloadrow = indexURLEntry.urlEntryRow; private static final kelondroRow payloadrow = indexRWIEntryOld.urlEntryRow;
private final File oldDatabaseRoot; private final File oldDatabaseRoot;
private final kelondroOrder indexOrder = new kelondroNaturalOrder(true); private final kelondroOrder indexOrder = new kelondroNaturalOrder(true);
@ -201,7 +202,7 @@ public final class plasmaWordIndex implements indexRI {
return new indexContainer(wordHash, payloadrow); return new indexContainer(wordHash, payloadrow);
} }
public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean dhtInCase) { public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) {
// set dhtInCase depending on wordHash // set dhtInCase depending on wordHash
if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(wordHash))) dhtInCase = true; if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(wordHash))) dhtInCase = true;
@ -318,7 +319,7 @@ public final class plasmaWordIndex implements indexRI {
Iterator i = condenser.words(); Iterator i = condenser.words();
Map.Entry wentry; Map.Entry wentry;
String word; String word;
indexEntry ientry; indexRWIEntry ientry;
plasmaCondenser.wordStatProp wprop; plasmaCondenser.wordStatProp wprop;
String wordHash; String wordHash;
int urlLength = url.toString().length(); int urlLength = url.toString().length();
@ -330,7 +331,7 @@ public final class plasmaWordIndex implements indexRI {
wprop = (plasmaCondenser.wordStatProp) wentry.getValue(); wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = indexEntryAttribute.word2hash(word); wordHash = indexEntryAttribute.word2hash(word);
ientry = new indexURLEntry(urlHash, ientry = new indexRWIEntryOld(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(), urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
wprop.count, wprop.count,
condenser.RESULT_SIMI_WORDS, condenser.RESULT_SIMI_WORDS,
@ -685,11 +686,11 @@ public final class plasmaWordIndex implements indexRI {
// the combined container will fit, read the container // the combined container will fit, read the container
try { try {
Iterator entries = entity.elements(true); Iterator entries = entity.elements(true);
indexEntry entry; indexRWIEntry entry;
while (entries.hasNext()) { while (entries.hasNext()) {
entry = (indexEntry) entries.next(); entry = (indexRWIEntry) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash()); // System.out.println("ENTRY = " + entry.getUrlHash());
container.add(new indexEntry[]{entry}, System.currentTimeMillis()); container.add(new indexRWIEntry[]{entry}, System.currentTimeMillis());
} }
// we have read all elements, now delete the entity // we have read all elements, now delete the entity
entity.deleteComplete(); entity.deleteComplete();
@ -723,11 +724,11 @@ public final class plasmaWordIndex implements indexRI {
try { try {
Iterator entries = entity.elements(true); Iterator entries = entity.elements(true);
indexEntry entry; indexRWIEntry entry;
while (entries.hasNext()) { while (entries.hasNext()) {
entry = (indexEntry) entries.next(); entry = (indexRWIEntry) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash()); // System.out.println("ENTRY = " + entry.getUrlHash());
container.add(new indexEntry[] { entry }, System.currentTimeMillis()); container.add(new indexRWIEntry[] { entry }, System.currentTimeMillis());
} }
// we have read all elements, now delete the entity // we have read all elements, now delete the entity
entity.deleteComplete(); entity.deleteComplete();
@ -775,7 +776,7 @@ public final class plasmaWordIndex implements indexRI {
public void run() { public void run() {
serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started"); serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
indexContainer container = null; indexContainer container = null;
indexEntry entry = null; indexRWIEntry entry = null;
URL url = null; URL url = null;
HashSet urlHashs = new HashSet(); HashSet urlHashs = new HashSet();
try { try {
@ -787,9 +788,9 @@ public final class plasmaWordIndex implements indexRI {
wordHashNow = container.getWordHash(); wordHashNow = container.getWordHash();
while (containerIterator.hasNext() && run) { while (containerIterator.hasNext() && run) {
waiter(); waiter();
entry = (indexEntry) containerIterator.next(); entry = (indexRWIEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash()); // System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash());
plasmaCrawlLURLEntry ue = lurl.load(entry.urlHash(), null); indexURLEntry ue = lurl.load(entry.urlHash(), null);
if (ue == null) { if (ue == null) {
urlHashs.add(entry.urlHash()); urlHashs.add(entry.urlHash());
} else { } else {

@ -57,15 +57,15 @@ import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroColumn; import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public final class plasmaWordIndexAssortment { public final class plasmaWordIndexAssortment {
@ -89,7 +89,7 @@ public final class plasmaWordIndexAssortment {
private kelondroRow bufferStructure(int assortmentCapacity) { private kelondroRow bufferStructure(int assortmentCapacity) {
kelondroColumn[] structure = new kelondroColumn[3 + assortmentCapacity]; kelondroColumn[] structure = new kelondroColumn[3 + assortmentCapacity];
structure[0] = new kelondroColumn("byte[] wordhash-" + indexEntryAttribute.wordHashLength); structure[0] = new kelondroColumn("byte[] wordhash-" + yacySeedDB.commonHashLength);
structure[1] = new kelondroColumn("Cardinal occ-4 {b256}"); structure[1] = new kelondroColumn("Cardinal occ-4 {b256}");
structure[2] = new kelondroColumn("Cardinal time-8 {b256}"); structure[2] = new kelondroColumn("Cardinal time-8 {b256}");
kelondroColumn p = new kelondroColumn("byte[] urlprops-" + payloadrow.objectsize()); kelondroColumn p = new kelondroColumn("byte[] urlprops-" + payloadrow.objectsize());
@ -98,7 +98,7 @@ public final class plasmaWordIndexAssortment {
} }
private int assortmentCapacity(int rowsize) { private int assortmentCapacity(int rowsize) {
return (rowsize - indexEntryAttribute.wordHashLength - 12) / payloadrow.objectsize(); return (rowsize - yacySeedDB.commonHashLength - 12) / payloadrow.objectsize();
} }
public plasmaWordIndexAssortment(File storagePath, kelondroRow payloadrow, int assortmentLength, int bufferkb, long preloadTime, serverLog log) throws IOException { public plasmaWordIndexAssortment(File storagePath, kelondroRow payloadrow, int assortmentLength, int bufferkb, long preloadTime, serverLog log) throws IOException {
@ -133,9 +133,9 @@ public final class plasmaWordIndexAssortment {
row.setCol(1, 1); row.setCol(1, 1);
row.setCol(2, newContainer.updated()); row.setCol(2, newContainer.updated());
Iterator entries = newContainer.entries(); Iterator entries = newContainer.entries();
indexEntry entry; indexRWIEntry entry;
for (int i = 0; i < assortmentLength; i++) { for (int i = 0; i < assortmentLength; i++) {
entry = (indexEntry) entries.next(); entry = (indexRWIEntry) entries.next();
row.setCol(3 + i, entry.toKelondroEntry().bytes()); row.setCol(3 + i, entry.toKelondroEntry().bytes());
} }
kelondroRow.Entry oldrow = null; kelondroRow.Entry oldrow = null;
@ -221,7 +221,7 @@ public final class plasmaWordIndexAssortment {
indexContainer container = new indexContainer(wordHash, payloadrow); indexContainer container = new indexContainer(wordHash, payloadrow);
int al = assortmentCapacity(row.objectsize()); int al = assortmentCapacity(row.objectsize());
for (int i = 0; i < al; i++) { for (int i = 0; i < al; i++) {
container.add(new indexEntry[] { new indexURLEntry(row.getColBytes(3 + i)) }, updateTime); container.add(new indexRWIEntry[] { new indexRWIEntryOld(row.getColBytes(3 + i)) }, updateTime);
} }
return container; return container;
} }

@ -54,7 +54,7 @@ import java.util.Set;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexContainerOrder; import de.anomic.index.indexContainerOrder;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRI; import de.anomic.index.indexRI;
import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroMergeIterator; import de.anomic.kelondro.kelondroMergeIterator;
@ -168,7 +168,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI {
c = new indexContainer(newContainer.getWordHash(), payloadrow); c = new indexContainer(newContainer.getWordHash(), payloadrow);
for (int k = 0; k < j; k++) { for (int k = 0; k < j; k++) {
if (i.hasNext()) { if (i.hasNext()) {
c.add((indexEntry) i.next(), newContainer.updated()); c.add((indexRWIEntry) i.next(), newContainer.updated());
} else { } else {
storeForced(c); storeForced(c);
return; return;
@ -178,7 +178,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI {
} }
} }
public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = new indexContainer(wordHash, payloadrow); indexContainer container = new indexContainer(wordHash, payloadrow);
container.add(newEntry); container.add(newEntry);
return addEntries(container, updateTime, dhtCase); return addEntries(container, updateTime, dhtCase);
@ -223,7 +223,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI {
c = new indexContainer(newContainer.getWordHash(), payloadrow); c = new indexContainer(newContainer.getWordHash(), payloadrow);
for (int k = 0; k <= j; k++) { for (int k = 0; k <= j; k++) {
assert (i.hasNext()); assert (i.hasNext());
c.add((indexEntry) i.next(), newContainer.updated()); c.add((indexRWIEntry) i.next(), newContainer.updated());
} }
try { try {
storeForced(c); storeForced(c);
@ -306,9 +306,9 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI {
if (buffer != null) { if (buffer != null) {
// sort out url hashes that shall be deleted // sort out url hashes that shall be deleted
Iterator bi = buffer.entries(); Iterator bi = buffer.entries();
indexEntry entry; indexRWIEntry entry;
while (bi.hasNext()) { while (bi.hasNext()) {
entry = (indexEntry) bi.next(); entry = (indexRWIEntry) bi.next();
if (urlHashes.remove(entry.urlHash())) bi.remove(); if (urlHashes.remove(entry.urlHash())) bi.remove();
} }
record.add(buffer, -1); record.add(buffer, -1);

@ -49,13 +49,13 @@ import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURL; import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public final class plasmaWordIndexFile { public final class plasmaWordIndexFile {
@ -91,7 +91,7 @@ public final class plasmaWordIndexFile {
long cacheSize = theLocation.length(); long cacheSize = theLocation.length();
if (cacheSize > 1048576) cacheSize = 1048576; if (cacheSize > 1048576) cacheSize = 1048576;
return kelondroTree.open(theLocation, cacheSize, 0, return kelondroTree.open(theLocation, cacheSize, 0,
new kelondroRow("byte[] urlhash-" + indexURL.urlHashLength + ", byte[] ba-" + (indexURLEntry.urlEntryRow.objectsize() - indexURL.urlHashLength))); new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength + ", byte[] ba-" + (indexRWIEntryOld.urlEntryRow.objectsize() - yacySeedDB.commonHashLength)));
} }
public static File wordHash2path(File databaseRoot, String hash) { public static File wordHash2path(File databaseRoot, String hash) {
@ -128,23 +128,23 @@ public final class plasmaWordIndexFile {
} catch (IOException e) {} } catch (IOException e) {}
} }
public indexEntry getEntry(String urlhash) throws IOException { public indexRWIEntry getEntry(String urlhash) throws IOException {
kelondroRow.Entry n = theIndex.get(urlhash.getBytes()); kelondroRow.Entry n = theIndex.get(urlhash.getBytes());
if (n == null) return null; if (n == null) return null;
return new indexURLEntry(n.getColString(0, null), n.getColString(1, null)); return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null));
} }
public boolean contains(String urlhash) throws IOException { public boolean contains(String urlhash) throws IOException {
return (theIndex.get(urlhash.getBytes()) != null); return (theIndex.get(urlhash.getBytes()) != null);
} }
public boolean contains(indexEntry entry) throws IOException { public boolean contains(indexRWIEntry entry) throws IOException {
return (theIndex.get(entry.urlHash().getBytes()) != null); return (theIndex.get(entry.urlHash().getBytes()) != null);
} }
public boolean addEntry(indexEntry entry) throws IOException { public boolean addEntry(indexRWIEntry entry) throws IOException {
if (entry == null) return false; if (entry == null) return false;
indexEntry oldEntry = getEntry(entry.urlHash()); indexRWIEntry oldEntry = getEntry(entry.urlHash());
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity
return false; return false;
} }
@ -163,7 +163,7 @@ public final class plasmaWordIndexFile {
if (container != null) { if (container != null) {
Iterator i = container.entries(); Iterator i = container.entries();
while (i.hasNext()) { while (i.hasNext()) {
if (addEntry((indexEntry) i.next())) count++; if (addEntry((indexRWIEntry) i.next())) count++;
} }
} }
@ -228,7 +228,7 @@ public final class plasmaWordIndexFile {
public Object next() { public Object next() {
if (i == null) return null; if (i == null) return null;
kelondroRow.Entry n = (kelondroRow.Entry) i.next(); kelondroRow.Entry n = (kelondroRow.Entry) i.next();
return new indexURLEntry(n.getColString(0, null), n.getColString(1, null)); return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null));
} }
public void remove() { public void remove() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
@ -248,7 +248,7 @@ public final class plasmaWordIndexFile {
long timeout = (time == -1) ? Long.MAX_VALUE : System.currentTimeMillis() + time; long timeout = (time == -1) ? Long.MAX_VALUE : System.currentTimeMillis() + time;
try { try {
while ((i.hasNext()) && (System.currentTimeMillis() < timeout)) { while ((i.hasNext()) && (System.currentTimeMillis() < timeout)) {
addEntry((indexEntry) i.next()); addEntry((indexRWIEntry) i.next());
} }
} catch (kelondroException e) { } catch (kelondroException e) {
serverLog.logSevere("PLASMA", "plasmaWordIndexEntity.merge: " + e.getMessage()); serverLog.logSevere("PLASMA", "plasmaWordIndexEntity.merge: " + e.getMessage());

@ -51,7 +51,7 @@ import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRI; import de.anomic.index.indexRI;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
@ -235,10 +235,10 @@ public class plasmaWordIndexFileCluster implements indexRI {
if (exists(wordHash)) { if (exists(wordHash)) {
plasmaWordIndexFile entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10); plasmaWordIndexFile entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
indexContainer container = new indexContainer(wordHash, payloadrow); indexContainer container = new indexContainer(wordHash, payloadrow);
indexEntry entry; indexRWIEntry entry;
Iterator i = entity.elements(true); Iterator i = entity.elements(true);
while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) { while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) {
entry = (indexEntry) i.next(); entry = (indexRWIEntry) i.next();
if ((urlselection == null) || (urlselection.contains(entry.urlHash()))) container.add(entry); if ((urlselection == null) || (urlselection.contains(entry.urlHash()))) container.add(entry);
} }
return container; return container;
@ -302,7 +302,7 @@ public class plasmaWordIndexFileCluster implements indexRI {
} else return 0; } else return 0;
} }
public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = new indexContainer(wordHash, payloadrow); indexContainer container = new indexContainer(wordHash, payloadrow);
container.add(newEntry); container.add(newEntry);
return addEntries(container, updateTime, dhtCase); return addEntries(container, updateTime, dhtCase);

@ -55,14 +55,14 @@ import java.util.TreeMap;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSnippetCache;
@ -491,33 +491,33 @@ public final class yacyClient {
//System.out.println("***result count " + results); //System.out.println("***result count " + results);
// create containers // create containers
final int words = wordhashes.length() / indexEntryAttribute.wordHashLength; final int words = wordhashes.length() / yacySeedDB.commonHashLength;
indexContainer[] container = new indexContainer[words]; indexContainer[] container = new indexContainer[words];
for (int i = 0; i < words; i++) { for (int i = 0; i < words; i++) {
container[i] = new indexContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength), indexURLEntry.urlEntryRow); container[i] = new indexContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), indexRWIEntryOld.urlEntryRow);
} }
// insert results to containers // insert results to containers
plasmaCrawlLURLEntry urlEntry; indexURLEntry urlEntry;
String[] urls = new String[results]; String[] urls = new String[results];
for (int n = 0; n < results; n++) { for (int n = 0; n < results; n++) {
// get one single search result // get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n)); urlEntry = urlManager.newEntry((String) result.get("resource" + n));
if (urlEntry == null) continue; if (urlEntry == null) continue;
assert (urlEntry.hash().length() == 12) : "urlEntry.hash() = " + urlEntry.hash(); assert (urlEntry.hash().length() == 12) : "urlEntry.hash() = " + urlEntry.hash();
plasmaCrawlLURLEntry.Components comp = urlEntry.comp(); indexURLEntry.Components comp = urlEntry.comp();
if (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, comp.url())) continue; // block with backlist if (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, comp.url())) continue; // block with backlist
urlManager.store(urlEntry); urlManager.store(urlEntry);
urlManager.stack(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); urlManager.stack(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry // save the url entry
final indexEntry entry; final indexRWIEntry entry;
if (urlEntry.word() == null) { if (urlEntry.word() == null) {
// the old way to define words // the old way to define words
int urlLength = comp.url().toNormalform().length(); int urlLength = comp.url().toNormalform().length();
int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length; int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
entry = new indexURLEntry( entry = new indexRWIEntryOld(
urlEntry.hash(), urlEntry.hash(),
urlLength, urlLength,
urlComps, urlComps,
@ -545,7 +545,7 @@ public final class yacyClient {
} }
// add the url entry to the word indexes // add the url entry to the word indexes
for (int m = 0; m < words; m++) { for (int m = 0; m < words; m++) {
container[m].add(new indexEntry[]{entry}, System.currentTimeMillis()); container[m].add(new indexRWIEntry[]{entry}, System.currentTimeMillis());
} }
// store url hash for statistics // store url hash for statistics
urls[n] = urlEntry.hash(); urls[n] = urlEntry.hash();
@ -869,7 +869,7 @@ public final class yacyClient {
-er crawlt, Ergebnis erscheint aber unter falschem initiator -er crawlt, Ergebnis erscheint aber unter falschem initiator
*/ */
public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURLEntry entry, String wordhashes) { public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, indexURLEntry entry, String wordhashes) {
if (targetSeed == null) { return null; } if (targetSeed == null) { return null; }
if (yacyCore.seedDB.mySeed == null) { return null; } if (yacyCore.seedDB.mySeed == null) { return null; }
if (yacyCore.seedDB.mySeed == targetSeed) { return null; } if (yacyCore.seedDB.mySeed == targetSeed) { return null; }
@ -943,11 +943,11 @@ public final class yacyClient {
// check if we got all necessary urls in the urlCache (only for debugging) // check if we got all necessary urls in the urlCache (only for debugging)
Iterator eenum; Iterator eenum;
indexEntry entry; indexRWIEntry entry;
for (int i = 0; i < indexes.length; i++) { for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries(); eenum = indexes[i].entries();
while (eenum.hasNext()) { while (eenum.hasNext()) {
entry = (indexEntry) eenum.next(); entry = (indexRWIEntry) eenum.next();
if (urlCache.get(entry.urlHash()) == null) { if (urlCache.get(entry.urlHash()) == null) {
yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.urlHash() + "' is not contained in urlCache"); yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.urlHash() + "' is not contained in urlCache");
} }
@ -988,9 +988,9 @@ public final class yacyClient {
if (uhs.length == 0) { return resultObj; } // all url's known if (uhs.length == 0) { return resultObj; } // all url's known
// extract the urlCache from the result // extract the urlCache from the result
plasmaCrawlLURLEntry[] urls = new plasmaCrawlLURLEntry[uhs.length]; indexURLEntry[] urls = new indexURLEntry[uhs.length];
for (int i = 0; i < uhs.length; i++) { for (int i = 0; i < uhs.length; i++) {
urls[i] = (plasmaCrawlLURLEntry) urlCache.get(uhs[i]); urls[i] = (indexURLEntry) urlCache.get(uhs[i]);
if (urls[i] == null) { if (urls[i] == null) {
yacyCore.log.logFine("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'"); yacyCore.log.logFine("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
} }
@ -1051,11 +1051,11 @@ public final class yacyClient {
int indexcount = 0; int indexcount = 0;
final StringBuffer entrypost = new StringBuffer(indexes.length*73); final StringBuffer entrypost = new StringBuffer(indexes.length*73);
Iterator eenum; Iterator eenum;
indexEntry entry; indexRWIEntry entry;
for (int i = 0; i < indexes.length; i++) { for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries(); eenum = indexes[i].entries();
while (eenum.hasNext()) { while (eenum.hasNext()) {
entry = (indexEntry) eenum.next(); entry = (indexRWIEntry) eenum.next();
entrypost.append(indexes[i].getWordHash()) entrypost.append(indexes[i].getWordHash())
.append(entry.toPropertyForm(false)) .append(entry.toPropertyForm(false))
.append(serverCore.crlfString); .append(serverCore.crlfString);
@ -1099,7 +1099,7 @@ public final class yacyClient {
} }
} }
private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURLEntry[] urls, boolean gzipBody, int timeout) { private static HashMap transferURL(yacySeed targetSeed, indexURLEntry[] urls, boolean gzipBody, int timeout) {
// this post a message to the remote message board // this post a message to the remote message board
final String address = targetSeed.getAddress(); final String address = targetSeed.getAddress();
if (address == null) { return null; } if (address == null) { return null; }

@ -71,10 +71,11 @@ import de.anomic.http.httpd;
import de.anomic.http.httpdFileHandler; import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler; import de.anomic.http.httpdProxyHandler;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryOld;
import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroMap;
@ -83,8 +84,6 @@ import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaCrawlLURLOldEntry;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURLPool; import de.anomic.plasma.plasmaURLPool;
@ -623,7 +622,7 @@ public final class yacy {
kelondroMScoreCluster hs = new kelondroMScoreCluster(); kelondroMScoreCluster hs = new kelondroMScoreCluster();
while (ef.hasMoreElements()) { while (ef.hasMoreElements()) {
f = (File) ef.nextElement(); f = (File) ef.nextElement();
h = f.getName().substring(0, indexURL.urlHashLength); h = f.getName().substring(0, yacySeedDB.commonHashLength);
hs.addScore(h, (int) f.length()); hs.addScore(h, (int) f.length());
} }
@ -740,12 +739,12 @@ public final class yacy {
// the combined container will fit, read the container // the combined container will fit, read the container
Iterator wordIdxEntries = wordIdxContainer.entries(); Iterator wordIdxEntries = wordIdxContainer.entries();
indexEntry iEntry; indexRWIEntry iEntry;
while (wordIdxEntries.hasNext()) { while (wordIdxEntries.hasNext()) {
iEntry = (indexEntry) wordIdxEntries.next(); iEntry = (indexRWIEntry) wordIdxEntries.next();
String urlHash = iEntry.urlHash(); String urlHash = iEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURLEntry urlEntry = currentUrlDB.load(urlHash, null); indexURLEntry urlEntry = currentUrlDB.load(urlHash, null);
urlCounter++; urlCounter++;
minimizedUrlDB.store(urlEntry); minimizedUrlDB.store(urlEntry);
if (urlCounter % 500 == 0) { if (urlCounter % 500 == 0) {
@ -965,11 +964,11 @@ public final class yacy {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
if (source.equals("lurl")) { if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null); Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURLEntry entry; indexURLEntry entry;
while (eiter.hasNext()) { while (eiter.hasNext()) {
try { try {
entry = (plasmaCrawlLURLEntry) eiter.next(); entry = (indexURLEntry) eiter.next();
plasmaCrawlLURLEntry.Components comp = entry.comp(); indexURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null); if ((entry != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null);
} catch (Exception e) { } catch (Exception e) {
// here a MalformedURLException may occur // here a MalformedURLException may occur
@ -1077,10 +1076,10 @@ public final class yacy {
if (source.equals("lurl")) { if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null); Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURLEntry entry; indexURLEntry entry;
while (eiter.hasNext()) { while (eiter.hasNext()) {
entry = (plasmaCrawlLURLEntry) eiter.next(); entry = (indexURLEntry) eiter.next();
plasmaCrawlLURLEntry.Components comp = entry.comp(); indexURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) { if ((entry != null) && (comp.url() != null)) {
if (html) { if (html) {
bos.write(("<a href=\"" + comp.url().toNormalform() + "\">" + comp.descr() + "</a><br>").getBytes("UTF-8")); bos.write(("<a href=\"" + comp.url().toNormalform() + "\">" + comp.descr() + "</a><br>").getBytes("UTF-8"));
@ -1135,7 +1134,7 @@ public final class yacy {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, true, 1000, true, 1000, true, 10000); plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, true, 1000, true, 1000, true, 10000);
kelondroTree oldindex = null; kelondroTree oldindex = null;
try { try {
oldindex = new kelondroTree(urlHash, 1000, -1, plasmaCrawlLURLOldEntry.rowdef); oldindex = new kelondroTree(urlHash, 1000, -1, indexURLEntryOld.rowdef);
} catch (IOException e) { } catch (IOException e) {
System.out.println("ERROR: CANNOT OPEN OLD INDEX: " + e.getMessage()); System.out.println("ERROR: CANNOT OPEN OLD INDEX: " + e.getMessage());
} }
@ -1145,9 +1144,9 @@ public final class yacy {
int tc = oldindex.size(), c = 0; int tc = oldindex.size(), c = 0;
Iterator eiter = oldindex.contentRows(-1); Iterator eiter = oldindex.contentRows(-1);
kelondroRow.Entry oldrow; kelondroRow.Entry oldrow;
plasmaCrawlLURLEntry oldentry; indexURLEntry oldentry;
plasmaCrawlLURLEntry newentry; indexURLEntry newentry;
plasmaCrawlLURLEntry.Components comp; indexURLEntry.Components comp;
byte[] dummymd5 = new byte[0]; byte[] dummymd5 = new byte[0];
while (eiter.hasNext()) { while (eiter.hasNext()) {
try { try {
@ -1158,7 +1157,7 @@ public final class yacy {
oldrow = null; oldrow = null;
} }
if (oldrow != null) try { if (oldrow != null) try {
oldentry = new plasmaCrawlLURLOldEntry(oldrow, null); oldentry = new indexURLEntryOld(oldrow, null);
comp = oldentry.comp(); comp = oldentry.comp();
newentry = pool.loadedURL.newEntry( newentry = pool.loadedURL.newEntry(
comp.url(), comp.url(),
@ -1236,7 +1235,7 @@ public final class yacy {
WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, true, 8*1024*1024, 3000, log, sps.getConfigBool("useCollectionIndex", false)); WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, true, 8*1024*1024, 3000, log, sps.getConfigBool("useCollectionIndex", false));
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false); indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
} else if (resource.equals("assortments")) { } else if (resource.equals("assortments")) {
plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, indexURLEntry.urlEntryRow, 16*1024*1024, 3000, log); plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, indexRWIEntryOld.urlEntryRow, 16*1024*1024, 3000, log);
indexContainerIterator = assortmentCluster.wordContainers(wordChunkStartHash, true, false); indexContainerIterator = assortmentCluster.wordContainers(wordChunkStartHash, true, false);
} /*else if (resource.startsWith("assortment")) { } /*else if (resource.startsWith("assortment")) {
int a = Integer.parseInt(resource.substring(10)); int a = Integer.parseInt(resource.substring(10));

Loading…
Cancel
Save