From bb7d4b5d5e68ad0984fc3fe098b340dcfa107b7a Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 8 Nov 2006 16:17:47 +0000 Subject: [PATCH] refactoring to prepare new RWI entry object - moved all url and index(RWI) entries to index package - better naming to distinguish RWI entries and URL entries git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2937 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Bookmarks.java | 6 +- htroot/IndexControl_p.java | 34 +- htroot/IndexCreate_p.java | 5 +- htroot/IndexMonitor.java | 120 +++++-- htroot/ViewFile.java | 6 +- htroot/htdocsdefault/dir.java | 4 +- htroot/yacy/crawlOrder.java | 4 +- htroot/yacy/crawlReceipt.java | 9 +- htroot/yacy/search.java | 6 +- htroot/yacy/transferRWI.java | 8 +- htroot/yacy/transferURL.java | 6 +- htroot/yacysearch.java | 6 +- source/de/anomic/index/indexCachedRI.java | 2 +- source/de/anomic/index/indexCollectionRI.java | 2 +- source/de/anomic/index/indexContainer.java | 42 +-- .../de/anomic/index/indexEntryAttribute.java | 6 +- source/de/anomic/index/indexRAMRI.java | 14 +- source/de/anomic/index/indexRI.java | 2 +- .../{indexEntry.java => indexRWIEntry.java} | 18 +- source/de/anomic/index/indexRWIEntryOld.java | 323 ++++++++++++++++++ source/de/anomic/index/indexURL.java | 33 +- .../indexURLEntry.java} | 17 +- .../indexURLEntryNew.java} | 33 +- .../indexURLEntryOld.java} | 62 ++-- .../plasma/crawler/AbstractCrawlWorker.java | 3 +- .../plasma/dbImport/AssortmentImporter.java | 4 +- .../plasma/dbImport/plasmaDbImporter.java | 10 +- .../de/anomic/plasma/plasmaCrawlBalancer.java | 4 +- source/de/anomic/plasma/plasmaCrawlEURL.java | 36 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 175 +++------- source/de/anomic/plasma/plasmaCrawlNURL.java | 40 +-- .../de/anomic/plasma/plasmaCrawlProfile.java | 8 +- .../de/anomic/plasma/plasmaCrawlStacker.java | 18 +- source/de/anomic/plasma/plasmaDHTChunk.java | 15 +- source/de/anomic/plasma/plasmaHTCache.java | 5 +- .../de/anomic/plasma/plasmaSearchEvent.java | 13 +- .../de/anomic/plasma/plasmaSearchImages.java | 3 +- .../anomic/plasma/plasmaSearchPreOrder.java | 24 +- .../de/anomic/plasma/plasmaSearchQuery.java | 9 +- .../plasma/plasmaSearchRankingProfile.java | 9 +- .../de/anomic/plasma/plasmaSearchResult.java | 17 +- .../de/anomic/plasma/plasmaSnippetCache.java | 5 +- .../de/anomic/plasma/plasmaSwitchboard.java | 32 +- .../anomic/plasma/plasmaSwitchboardQueue.java | 22 +- source/de/anomic/plasma/plasmaURLPool.java | 3 +- source/de/anomic/plasma/plasmaWordIndex.java | 29 +- .../plasma/plasmaWordIndexAssortment.java | 16 +- .../plasmaWordIndexAssortmentCluster.java | 12 +- .../de/anomic/plasma/plasmaWordIndexFile.java | 24 +- .../plasma/plasmaWordIndexFileCluster.java | 8 +- source/de/anomic/yacy/yacyClient.java | 34 +- source/yacy.java | 39 ++- 52 files changed, 847 insertions(+), 538 deletions(-) rename source/de/anomic/index/{indexEntry.java => indexRWIEntry.java} (79%) create mode 100644 source/de/anomic/index/indexRWIEntryOld.java rename source/de/anomic/{plasma/plasmaCrawlLURLEntry.java => index/indexURLEntry.java} (90%) rename source/de/anomic/{plasma/plasmaCrawlLURLNewEntry.java => index/indexURLEntryNew.java} (92%) rename source/de/anomic/{plasma/plasmaCrawlLURLOldEntry.java => index/indexURLEntryOld.java} (84%) diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 111169f3a..42a36bc6d 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -55,8 +55,8 @@ import de.anomic.data.bookmarksDB; import de.anomic.data.listManager; import de.anomic.data.bookmarksDB.Tag; import de.anomic.http.httpHeader; +import de.anomic.index.indexURLEntry; import de.anomic.net.URL; -import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -147,10 +147,10 @@ public class Bookmarks { bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash); if (bookmark == null) { // try to get the bookmark from the LURL database - plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); + indexURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); plasmaParserDocument document = null; if (urlentry != null) { - plasmaCrawlLURLEntry.Components comp = urlentry.comp(); + indexURLEntry.Components comp = urlentry.comp(); document = switchboard.snippetCache.retrieveDocument(comp.url(), true); prop.put("mode_edit", 0); // create mode prop.put("mode_url", comp.url().toNormalform()); diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 73d44636f..86f3b8561 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -57,11 +57,11 @@ import java.util.TreeMap; import de.anomic.http.httpHeader; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; import de.anomic.net.URL; -import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.urlPattern.plasmaURLPattern; @@ -161,7 +161,7 @@ public class IndexControl_p { int i = 0; urlx = new String[index.size()]; while (en.hasNext()) { - urlx[i++] = ((indexEntry) en.next()).urlHash(); + urlx[i++] = ((indexRWIEntry) en.next()).urlHash(); } index = null; } @@ -218,7 +218,7 @@ public class IndexControl_p { } if (post.containsKey("urlhashdelete")) { - plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); if (entry == null) { prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } else { @@ -263,10 +263,10 @@ public class IndexControl_p { Iterator urlIter = index.entries(); HashMap knownURLs = new HashMap(); HashSet unknownURLEntries = new HashSet(); - indexEntry iEntry; - plasmaCrawlLURLEntry lurl; + indexRWIEntry iEntry; + indexURLEntry lurl; while (urlIter.hasNext()) { - iEntry = (indexEntry) urlIter.next(); + iEntry = (indexRWIEntry) urlIter.next(); lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null); if (lurl == null) { unknownURLEntries.add(iEntry.urlHash()); @@ -320,7 +320,7 @@ public class IndexControl_p { URL url = new URL(urlstring); urlhash = indexURL.urlHash(url); prop.put("urlhash", urlhash); - plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); if (entry == null) { prop.put("urlstring", "unknown url: " + urlstring); prop.put("urlhash", ""); @@ -334,7 +334,7 @@ public class IndexControl_p { } if (post.containsKey("urlhashsearch")) { - plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + indexURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); if (entry == null) { prop.put("result", "No Entry for URL hash " + urlhash); } else { @@ -348,12 +348,12 @@ public class IndexControl_p { try { final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash); StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:
"); - plasmaCrawlLURLEntry entry; + indexURLEntry entry; int i = 0; int rows = 0, cols = 0; prop.put("urlhashsimilar", 1); while (entryIt.hasNext() && i < 256) { - entry = (plasmaCrawlLURLEntry) entryIt.next(); + entry = (indexURLEntry) entryIt.next(); prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", entry.hash()); cols++; if (cols==8) { @@ -400,16 +400,16 @@ public class IndexControl_p { return prop; } - public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURLEntry entry, String urlhash) { + public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, indexURLEntry entry, String urlhash) { serverObjects prop = new serverObjects(); if (entry == null) { prop.put("genUrlProfile", 1); prop.put("genUrlProfile_urlhash", urlhash); return prop; } - plasmaCrawlLURLEntry.Components comp = entry.comp(); + indexURLEntry.Components comp = entry.comp(); String referrer = null; - plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null); + indexURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null); if (le == null) { referrer = ""; } else { @@ -453,11 +453,11 @@ public class IndexControl_p { int i = 0; final TreeMap tm = new TreeMap(); - indexEntry xi; + indexRWIEntry xi; while (en.hasNext()) { - xi = (indexEntry) en.next(); + xi = (indexRWIEntry) en.next(); uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())}; - plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null); + indexURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null); if (le == null) { tm.put(uh[0], uh); } else { diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index bca1de11f..5bd0fe0fa 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -60,6 +60,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpHeader; import de.anomic.index.indexURL; +import de.anomic.index.indexRWIEntryOld; import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlProfile; @@ -204,7 +205,7 @@ public class IndexCreate_p { prop.put("error_reasonString", reasonString); plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength)); + crawlingStartURL.getHost(), reasonString, new bitfield(indexRWIEntryOld.urlFlagLength)); ee.store(); switchboard.urlPool.errorURL.stackPushEntry(ee); } @@ -282,7 +283,7 @@ public class IndexCreate_p { c++; } else { plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - (String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength)); + (String) e.getValue(), rejectReason, new bitfield(indexRWIEntryOld.urlFlagLength)); ee.store(); switchboard.urlPool.errorURL.stackPushEntry(ee); } diff --git a/htroot/IndexMonitor.java b/htroot/IndexMonitor.java index 3a015f938..97568020e 100644 --- a/htroot/IndexMonitor.java +++ b/htroot/IndexMonitor.java @@ -43,22 +43,33 @@ // javac -classpath .:../Classes Settings_p.java // if the shell's current path is HTROOT +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; + import de.anomic.http.httpHeader; +import de.anomic.index.indexURLEntry; +import de.anomic.net.URL; +import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; +import de.anomic.server.logging.serverLog; +import de.anomic.tools.nxTools; +import de.anomic.yacy.yacyCore; +import de.anomic.yacy.yacySeed; public class IndexMonitor { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { // return variable that accumulates replacements - plasmaSwitchboard switchboard = (plasmaSwitchboard) env; + plasmaSwitchboard sb = (plasmaSwitchboard) env; serverObjects prop = new serverObjects(); - int showIndexedCount = 40; - boolean si = false; - boolean se = false; + int lines = 40; + boolean showInit = false; + boolean showExec = false; if (post == null) { @@ -67,20 +78,20 @@ public class IndexMonitor { } // find process number - int process; + int tabletype; try { - process = Integer.parseInt(post.get("process", "0")); + tabletype = Integer.parseInt(post.get("process", "0")); } catch (NumberFormatException e) { - process = 0; + tabletype = 0; } // check if authorization is needed and/or given - if (((process > 0) && (process < 6)) || + if (((tabletype > 0) && (tabletype < 6)) || (post.containsKey("clearlist")) || (post.containsKey("deleteentry"))) { String authorization = ((String) header.get("Authorization", "xxxxxx")); if (authorization.length() != 0) { - if (! switchboard.verifyAuthentication(header, true)){ + if (! sb.verifyAuthentication(header, true)){ // force log-in (again, because wrong password was given) prop.put("AUTHENTICATE", "admin log-in"); return prop; @@ -94,33 +105,102 @@ public class IndexMonitor { // custom number of lines if (post.containsKey("count")) { - showIndexedCount = Integer.parseInt(post.get("count", "40")); + lines = Integer.parseInt(post.get("count", "40")); } // do the commands - if (post.containsKey("clearlist")) switchboard.urlPool.loadedURL.clearStack(process); + if (post.containsKey("clearlist")) sb.urlPool.loadedURL.clearStack(tabletype); if (post.containsKey("deleteentry")) { String hash = post.get("hash", null); if (hash != null) { // delete from database - switchboard.urlPool.loadedURL.remove(hash); + sb.urlPool.loadedURL.remove(hash); } } if (post.containsKey("moreIndexed")) { - showIndexedCount = Integer.parseInt(post.get("showIndexed", "40")); + lines = Integer.parseInt(post.get("showIndexed", "40")); } - if (post.get("si") != null) si = true; - if (post.get("se") != null) se = true; + if (post.get("si") != null) showInit = true; + if (post.get("se") != null) showExec = true; // create table - if (process == 0) { + if (tabletype == 0) { prop.put("table", 2); + } else if (sb.urlPool.loadedURL.getStackSize(tabletype) == 0) { + prop.put("table", 0); } else { - prop.putAll(switchboard.urlPool.loadedURL.genTableProps(process, showIndexedCount, si, se, "unknown", null, "IndexMonitor.html", true)); + prop.put("table", 1); + if (lines > sb.urlPool.loadedURL.getStackSize(tabletype)) lines = sb.urlPool.loadedURL.getStackSize(tabletype); + if (lines == sb.urlPool.loadedURL.getStackSize(tabletype)) { + prop.put("table_size", 0); + } else { + prop.put("table_size", 1); + prop.put("table_size_count", lines); + } + prop.put("table_size_all", sb.urlPool.loadedURL.getStackSize(tabletype)); + prop.put("table_feedbackpage", "IndexMonitor.html"); + prop.put("table_tabletype", tabletype); + prop.put("table_showInit", (showInit) ? 1 : 0); + prop.put("table_showExec", (showExec) ? 1 : 0); + + boolean dark = true; + String urlHash, initiatorHash, executorHash; + String cachepath, urlstr, urltxt; + yacySeed initiatorSeed, executorSeed; + indexURLEntry urle; + + // needed for getCachePath(url) + final plasmaHTCache cacheManager = sb.getCacheManager(); + + int i, cnt = 0; + for (i = sb.urlPool.loadedURL.getStackSize(tabletype) - 1; i >= (sb.urlPool.loadedURL.getStackSize(tabletype) - lines); i--) { + initiatorHash = sb.urlPool.loadedURL.getInitiatorHash(tabletype, i); + executorHash = sb.urlPool.loadedURL.getExecutorHash(tabletype, i); +// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash); + urlHash = sb.urlPool.loadedURL.getUrlHash(tabletype, i); +// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash); + try { + urle = sb.urlPool.loadedURL.load(urlHash, null); + indexURLEntry.Components comp = urle.comp(); +// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString()); + initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); + executorSeed = yacyCore.seedDB.getConnected(executorHash); + + urlstr = comp.url().toNormalform(); + urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL + cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1); + + prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0); + prop.put("table_indexed_" + cnt + "_feedbackpage", "IndexMonitor.html"); + prop.put("table_indexed_" + cnt + "_tabletype", tabletype); + prop.put("table_indexed_" + cnt + "_urlhash", urlHash); + prop.put("table_indexed_" + cnt + "_showInit", (showInit) ? 1 : 0); + prop.put("table_indexed_" + cnt + "_showInit_initiatorSeed", (initiatorSeed == null) ? "unknown" : initiatorSeed.getName()); + prop.put("table_indexed_" + cnt + "_showExec", (showExec) ? 1 : 0); + prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? "unknown" : executorSeed.getName()); + prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate())); + prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount()); + prop.put("table_indexed_" + cnt + "_urldescr", comp.descr()); + prop.put("table_indexed_" + cnt + "_url", (cachepath == null) ? "-not-cached-" : "" + urltxt + ""); + dark = !dark; + cnt++; + } catch (Exception e) { + serverLog.logSevere("PLASMA", "genTableProps", e); + } + } + prop.put("table_indexed", cnt); } - prop.put("process", process); - // return rewrite properties - return prop; + prop.put("process", tabletype); + // return rewrite properties + return prop; } + private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); + private static String daydate(Date date) { + if (date == null) { + return ""; + } else { + return dayFormatter.format(date); + } + } } diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 7302d7465..47d428683 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -54,7 +54,7 @@ import java.util.Enumeration; import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; import de.anomic.http.httpc; -import de.anomic.plasma.plasmaCrawlLURLEntry; +import de.anomic.index.indexURLEntry; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; @@ -106,7 +106,7 @@ public class ViewFile { String viewMode = post.get("viewMode","sentences"); // getting the urlEntry that belongs to the url hash - plasmaCrawlLURLEntry urlEntry = null; + indexURLEntry urlEntry = null; urlEntry = sb.urlPool.loadedURL.load(urlHash, null); if (urlEntry == null) { prop.put("error",2); @@ -115,7 +115,7 @@ public class ViewFile { } // gettin the url that belongs to the entry - plasmaCrawlLURLEntry.Components comp = urlEntry.comp(); + indexURLEntry.Components comp = urlEntry.comp(); if ((comp == null) || (comp.url() == null)) { prop.put("error",3); prop.put("viewMode",VIEW_MODE_NO_TEXT); diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index a4db30398..8e347ddde 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -61,10 +61,10 @@ import de.anomic.data.userDB; import de.anomic.http.httpHeader; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.net.URL; import de.anomic.plasma.plasmaCondenser; -import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCodings; import de.anomic.server.serverCore; @@ -362,7 +362,7 @@ public class dir { try { final URL url = new URL(urlstring); final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes())); - final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry( + final indexURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry( url, "YaCyShare: " + descr, yacyCore.seedDB.mySeed.getName(), diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java index b60545195..fad5b9b39 100644 --- a/htroot/yacy/crawlOrder.java +++ b/htroot/yacy/crawlOrder.java @@ -50,8 +50,8 @@ import java.util.Date; import de.anomic.http.httpHeader; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; import de.anomic.net.URL; -import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -249,7 +249,7 @@ public final class crawlOrder { // case where we have already the url loaded; reason = reasonString; // send lurl-Entry as response - plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null); + indexURLEntry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null); if (entry == null) { response = "rejected"; lurl = ""; diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 1973adbb9..88a0d10eb 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -50,8 +50,9 @@ import java.io.IOException; import de.anomic.http.httpHeader; import de.anomic.index.indexURL; +import de.anomic.index.indexRWIEntryOld; +import de.anomic.index.indexURLEntry; import de.anomic.plasma.plasmaCrawlEURL; -import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -124,12 +125,12 @@ public final class crawlReceipt { prop.put("delay", "3600"); } else if (result.equals("fill")) { // generating a new loaded URL entry - plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr); + indexURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr); if (entry == null) { log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam + "\n\tURL properties: "+ propStr); } else { - plasmaCrawlLURLEntry.Components comp = entry.comp(); + indexURLEntry.Components comp = entry.comp(); if (comp.url() == null) { log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + receivedUrlhash + " from peer " + iam + "\n\tURL properties: "+ propStr); @@ -156,7 +157,7 @@ public final class crawlReceipt { } else { try { plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash); - plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength)); + plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexRWIEntryOld.urlFlagLength)); ee.store(); switchboard.urlPool.errorURL.stackPushEntry(ee); switchboard.urlPool.noticeURL.remove(receivedUrlhash); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index d09819441..7ae96f71b 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -54,7 +54,7 @@ import java.util.Set; import de.anomic.http.httpHeader; import de.anomic.index.indexContainer; import de.anomic.index.indexURL; -import de.anomic.plasma.plasmaCrawlLURLEntry; +import de.anomic.index.indexURLEntry; import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; @@ -249,10 +249,10 @@ public final class search { StringBuffer links = new StringBuffer(); String resource = ""; //plasmaIndexEntry pie; - plasmaCrawlLURLEntry urlentry; + indexURLEntry urlentry; plasmaSnippetCache.Snippet snippet; while ((acc.hasMoreElements()) && (i < squery.wantedResults)) { - urlentry = (plasmaCrawlLURLEntry) acc.nextElement(); + urlentry = (indexURLEntry) acc.nextElement(); if (includesnippet) { snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, 260, 1000); } else { diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index fb5b17266..2fa8ea4fd 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -51,8 +51,8 @@ import java.util.Iterator; import java.util.LinkedList; import de.anomic.http.httpHeader; -import de.anomic.index.indexEntry; -import de.anomic.index.indexURLEntry; +import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexRWIEntryOld; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCore; @@ -146,7 +146,7 @@ public final class transferRWI { int p; String wordHash; String urlHash; - indexEntry iEntry; + indexRWIEntry iEntry; int wordhashesSize = v.size(); final HashSet unknownURL = new HashSet(); final HashSet knownURL = new HashSet(); @@ -162,7 +162,7 @@ public final class transferRWI { if (p > 0) { wordHash = estring.substring(0, p); wordhashes[received] = wordHash; - iEntry = new indexURLEntry(estring.substring(p)); + iEntry = new indexRWIEntryOld(estring.substring(p)); urlHash = iEntry.urlHash(); if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) { int deleted = sb.wordIndex.tryRemoveURLs(urlHash); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index a09388ff7..6984bf679 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -48,7 +48,7 @@ import java.io.IOException; import de.anomic.http.httpHeader; -import de.anomic.plasma.plasmaCrawlLURLEntry; +import de.anomic.index.indexURLEntry; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCore; @@ -90,7 +90,7 @@ public final class transferURL { final int sizeBefore = sb.urlPool.loadedURL.size(); // read the urls from the other properties and store String urls; - plasmaCrawlLURLEntry lEntry; + indexURLEntry lEntry; for (int i = 0; i < urlc; i++) { serverCore.checkInterruption(); urls = (String) post.get("url" + i); @@ -102,7 +102,7 @@ public final class transferURL { yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls); // TODO: should we send back an error message??? } else { - plasmaCrawlLURLEntry.Components comp = lEntry.comp(); + indexURLEntry.Components comp = lEntry.comp(); if (comp.url() == null) { yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls); // TODO: should we send back an error message??? diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 1721351fd..7c1034043 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -54,10 +54,10 @@ import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.http.httpHeader; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.net.URL; -import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSearchImages; import de.anomic.plasma.plasmaSearchPreOrder; @@ -189,9 +189,9 @@ public class yacysearch { return prop; } final String recommendHash = post.get("recommendref", ""); // urlhash - plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null); + indexURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null); if (urlentry != null) { - plasmaCrawlLURLEntry.Components comp = urlentry.comp(); + indexURLEntry.Components comp = urlentry.comp(); plasmaParserDocument document; document = sb.snippetCache.retrieveDocument(comp.url(), true); if (document != null) { diff --git a/source/de/anomic/index/indexCachedRI.java b/source/de/anomic/index/indexCachedRI.java index 90b6748e5..1fdf34efb 100644 --- a/source/de/anomic/index/indexCachedRI.java +++ b/source/de/anomic/index/indexCachedRI.java @@ -91,7 +91,7 @@ public class indexCachedRI implements indexRI { return new indexContainer(wordHash, payloadrow); } - public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean intern) { + public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean intern) { // add the entry if (intern) { riIntern.addEntry(wordHash, entry, updateTime, true); diff --git a/source/de/anomic/index/indexCollectionRI.java b/source/de/anomic/index/indexCollectionRI.java index c0e9218ed..6db01166f 100644 --- a/source/de/anomic/index/indexCollectionRI.java +++ b/source/de/anomic/index/indexCollectionRI.java @@ -152,7 +152,7 @@ public class indexCollectionRI implements indexRI { } } - public synchronized indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { + public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { indexContainer container = new indexContainer(wordHash, collectionIndex.payloadRow()); container.add(newEntry); return addEntries(container, updateTime, dhtCase); diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java index 2608a3c39..1fc7afbc9 100644 --- a/source/de/anomic/index/indexContainer.java +++ b/source/de/anomic/index/indexContainer.java @@ -81,18 +81,18 @@ public class indexContainer extends kelondroRowSet { return wordHash; } - public int add(indexEntry entry) { + public int add(indexRWIEntry entry) { this.addUnique(entry.toKelondroEntry()); return 1; } - public int add(indexEntry entry, long updateTime) { + public int add(indexRWIEntry entry, long updateTime) { this.add(entry); this.lastTimeWrote = updateTime; return 1; } - public int add(indexEntry[] entries, long updateTime) { + public int add(indexRWIEntry[] entries, long updateTime) { for (int i = 0; i < entries.length; i++) this.add(entries[i], updateTime); return entries.length; } @@ -106,7 +106,7 @@ public class indexContainer extends kelondroRowSet { Iterator i = c.entries(); while (i.hasNext()) { try { - if (addi((indexEntry) i.next())) x++; + if (addi((indexRWIEntry) i.next())) x++; } catch (ConcurrentModificationException e) { e.printStackTrace(); } @@ -117,13 +117,13 @@ public class indexContainer extends kelondroRowSet { return x; } - private boolean addi(indexEntry entry) { + private boolean addi(indexRWIEntry entry) { // returns true if the new entry was added, false if it already existed kelondroRow.Entry oldEntryRow = this.put(entry.toKelondroEntry()); if (oldEntryRow == null) { return true; } else { - indexEntry oldEntry = new indexURLEntry(oldEntryRow); // FIXME: see if cloning is necessary + indexRWIEntry oldEntry = new indexRWIEntryOld(oldEntryRow); // FIXME: see if cloning is necessary if (entry.isOlder(oldEntry)) { // A more recent Entry is already in this container this.put(oldEntry.toKelondroEntry()); // put it back return false; @@ -133,16 +133,16 @@ public class indexContainer extends kelondroRowSet { } } - public indexEntry get(String urlHash) { + public indexRWIEntry get(String urlHash) { kelondroRow.Entry entry = this.get(urlHash.getBytes()); if (entry == null) return null; - return new indexURLEntry(entry); + return new indexRWIEntryOld(entry); } - public indexEntry remove(String urlHash) { + public indexRWIEntry remove(String urlHash) { kelondroRow.Entry entry = this.remove(urlHash.getBytes()); if (entry == null) return null; - return new indexURLEntry(entry); + return new indexRWIEntryOld(entry); } public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { @@ -178,7 +178,7 @@ public class indexContainer extends kelondroRowSet { public Object next() { kelondroRow.Entry rentry = (kelondroRow.Entry) rowEntryIterator.next(); if (rentry == null) return null; - return new indexURLEntry(rentry); + return new indexRWIEntryOld(rentry); } public void remove() { @@ -288,10 +288,10 @@ public class indexContainer extends kelondroRowSet { assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString(); indexContainer conj = new indexContainer(null, small.rowdef); // start with empty search result Iterator se = small.entries(); - indexEntry ie0, ie1; + indexRWIEntry ie0, ie1; long stamp = System.currentTimeMillis(); while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) { - ie0 = (indexEntry) se.next(); + ie0 = (indexRWIEntry) se.next(); ie1 = large.get(ie0.urlHash()); if (ie1 != null) { // this is a hit. Calculate word distance: @@ -312,25 +312,25 @@ public class indexContainer extends kelondroRowSet { Iterator e2 = i2.entries(); int c; if ((e1.hasNext()) && (e2.hasNext())) { - indexEntry ie1; - indexEntry ie2; - ie1 = (indexEntry) e1.next(); - ie2 = (indexEntry) e2.next(); + indexRWIEntry ie1; + indexRWIEntry ie2; + ie1 = (indexRWIEntry) e1.next(); + ie2 = (indexRWIEntry) e2.next(); long stamp = System.currentTimeMillis(); while ((System.currentTimeMillis() - stamp) < time) { c = i1.order().compare(ie1.urlHash(), ie2.urlHash()); //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); if (c < 0) { - if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break; + if (e1.hasNext()) ie1 = (indexRWIEntry) e1.next(); else break; } else if (c > 0) { - if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break; + if (e2.hasNext()) ie2 = (indexRWIEntry) e2.next(); else break; } else { // we have found the same urls in different searches! ie1.combineDistance(ie2); if (ie1.worddistance() <= maxDistance) conj.add(ie1); - if (e1.hasNext()) ie1 = (indexEntry) e1.next(); else break; - if (e2.hasNext()) ie2 = (indexEntry) e2.next(); else break; + if (e1.hasNext()) ie1 = (indexRWIEntry) e1.next(); else break; + if (e2.hasNext()) ie2 = (indexRWIEntry) e2.next(); else break; } } } diff --git a/source/de/anomic/index/indexEntryAttribute.java b/source/de/anomic/index/indexEntryAttribute.java index 229fc0ca4..2156cad4b 100644 --- a/source/de/anomic/index/indexEntryAttribute.java +++ b/source/de/anomic/index/indexEntryAttribute.java @@ -35,10 +35,6 @@ import de.anomic.yacy.yacySeedDB; public class indexEntryAttribute { - // the size of a word hash - public static final int wordHashLength = yacySeedDB.commonHashLength; // 12 - public static final int urlHashLength = yacySeedDB.commonHashLength; // 12 - // doctypes: public static final char DT_PDFPS = 'p'; public static final char DT_TEXT = 't'; @@ -86,7 +82,7 @@ public class indexEntryAttribute { // create a word hash public static String word2hash(String word) { - return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, indexEntryAttribute.wordHashLength); + return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength); } // doctype calculation diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java index 0858ef8ba..ad00d3f28 100644 --- a/source/de/anomic/index/indexRAMRI.java +++ b/source/de/anomic/index/indexRAMRI.java @@ -81,7 +81,7 @@ public final class indexRAMRI implements indexRI { this.indexArrayFileName = dumpname; this.payloadrow = payloadrow; this.bufferStructureBasis = new kelondroRow( - "byte[] wordhash-" + indexEntryAttribute.wordHashLength + ", " + + "byte[] wordhash-" + yacySeedDB.commonHashLength + ", " + "Cardinal occ-4 {b256}, " + "Cardinal time-8 {b256}, " + "byte[] urlprops-" + payloadrow.objectsize()); @@ -114,7 +114,7 @@ public final class indexRAMRI implements indexRI { String wordHash; indexContainer container; long updateTime; - indexEntry iEntry; + indexRWIEntry iEntry; kelondroRow.Entry row = dumpArray.row().newEntry(); // write wCache @@ -131,7 +131,7 @@ public final class indexRAMRI implements indexRI { if (container != null) { Iterator ci = container.entries(); while (ci.hasNext()) { - iEntry = (indexEntry) ci.next(); + iEntry = (indexRWIEntry) ci.next(); row.setCol(0, wordHash.getBytes()); row.setCol(1, kelondroNaturalOrder.encodeLong(container.size(), 4)); row.setCol(2, kelondroNaturalOrder.encodeLong(updateTime, 8)); @@ -169,7 +169,7 @@ public final class indexRAMRI implements indexRI { Iterator i = dumpArray.contentRows(-1); String wordHash; //long creationTime; - indexEntry wordEntry; + indexRWIEntry wordEntry; kelondroRow.Entry row; //Runtime rt = Runtime.getRuntime(); while (i.hasNext()) { @@ -178,7 +178,7 @@ public final class indexRAMRI implements indexRI { if ((row == null) || (row.empty(0)) || (row.empty(3))) continue; wordHash = row.getColString(0, "UTF-8"); //creationTime = kelondroRecords.bytes2long(row[2]); - wordEntry = new indexURLEntry(row.getColBytes(3)); + wordEntry = new indexRWIEntryOld(row.getColBytes(3)); // store to cache addEntry(wordHash, wordEntry, startTime, false); urlCount++; @@ -437,10 +437,10 @@ public final class indexRAMRI implements indexRI { return null; } - public synchronized indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { + public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { indexContainer container = (indexContainer) cache.get(wordHash); if (container == null) container = new indexContainer(wordHash, this.payloadrow); - indexEntry[] entries = new indexEntry[] { newEntry }; + indexRWIEntry[] entries = new indexRWIEntry[] { newEntry }; if (container.add(entries, updateTime) > 0) { cache.put(wordHash, container); hashScore.incScore(wordHash); diff --git a/source/de/anomic/index/indexRI.java b/source/de/anomic/index/indexRI.java index 43187cb02..9618e0303 100644 --- a/source/de/anomic/index/indexRI.java +++ b/source/de/anomic/index/indexRI.java @@ -44,7 +44,7 @@ public interface indexRI { public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete); public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete); - public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean dhtCase); + public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtCase); public indexContainer addEntries(indexContainer newEntries, long creationTime, boolean dhtCase); public void close(int waitingSeconds); diff --git a/source/de/anomic/index/indexEntry.java b/source/de/anomic/index/indexRWIEntry.java similarity index 79% rename from source/de/anomic/index/indexEntry.java rename to source/de/anomic/index/indexRWIEntry.java index 8fc17aa22..f4e7caa84 100644 --- a/source/de/anomic/index/indexEntry.java +++ b/source/de/anomic/index/indexRWIEntry.java @@ -1,4 +1,4 @@ -// indexEntry.java +// indexRWIEntry.java // (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany // first published 20.05.2006 on http://www.anomic.de // @@ -28,7 +28,7 @@ package de.anomic.index; import de.anomic.kelondro.kelondroRow; -public interface indexEntry { +public interface indexRWIEntry { public Object clone(); public String toPropertyForm(boolean displayFormat); @@ -48,13 +48,13 @@ public interface indexEntry { public char getType(); public boolean isLocal(); - public void combineDistance(indexEntry oe); + public void combineDistance(indexRWIEntry oe); public int worddistance(); - public void min(indexEntry other); - public void max(indexEntry other); - public void normalize(indexEntry min, indexEntry max); - public indexEntry generateNormalized(indexEntry min, indexEntry max); - public boolean isNewer(indexEntry other); - public boolean isOlder(indexEntry other); + public void min(indexRWIEntry other); + public void max(indexRWIEntry other); + public void normalize(indexRWIEntry min, indexRWIEntry max); + public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max); + public boolean isNewer(indexRWIEntry other); + public boolean isOlder(indexRWIEntry other); } diff --git a/source/de/anomic/index/indexRWIEntryOld.java b/source/de/anomic/index/indexRWIEntryOld.java new file mode 100644 index 000000000..1461ad77e --- /dev/null +++ b/source/de/anomic/index/indexRWIEntryOld.java @@ -0,0 +1,323 @@ +// indexURLEntryNew.java +// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 21.07.2006 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +package de.anomic.index; + +import de.anomic.kelondro.kelondroColumn; +import de.anomic.kelondro.kelondroRow; +import de.anomic.kelondro.kelondroRow.Entry; +import de.anomic.plasma.plasmaWordIndex; +import de.anomic.yacy.yacySeedDB; + +public class indexRWIEntryOld implements Cloneable, indexRWIEntry { + + // this object stores attributes to URL references inside RWI collections + + // statics for value lengths + public static final int urlStringLength = 256;// not too short for links without parameters + public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or

) + public static final int urlNameLength = 40; // the tag content between and + public static final int urldescrtagsLength = 320;// the url, the description and tags in one string + public static final int urlErrorLength = 80; // a reason description for unavailable urls + public static final int urlDateLength = 4; // any date, shortened + public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index + public static final int urlFlagLength = 2; // any stuff + public static final int urlLanguageLength = 2; // taken from TLD suffix as quick-hack + public static final int urlDoctypeLength = 1; // taken from extension + public static final int urlSizeLength = 6; // the source size, from cache + public static final int urlWordCountLength = 3; // the number of words, from condenser + public static final int urlCrawlProfileHandleLength = 4; // name of the prefetch profile + public static final int urlCrawlDepthLength = 2; // prefetch depth, first is '0' + public static final int urlParentBranchesLength = 3; // number of anchors of the parent + public static final int urlForkFactorLength = 4; // sum of anchors of all ancestors + public static final int urlRetryLength = 2; // number of load retries + public static final int urlHostLength = 8; // the host as struncated name + public static final int urlHandleLength = 4; // a handle + public static final int urlQualityLength = 3; // taken from heuristic + + public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{ + new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"), + new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, urlQualityLength, "quality"), + new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "lastModified"), + new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "hitcount"), + new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, urlLanguageLength, "language"), + new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "doctype"), + new kelondroColumn("f", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "localflag"), + new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posintext"), + new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posinphrase"), + new kelondroColumn("o", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posofphrase"), + new kelondroColumn("i", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "worddistance"), + new kelondroColumn("w", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "wordcount"), + new kelondroColumn("p", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "phrasecount") + }); + + private static final int col_urlhash = 0; + private static final int col_quality = 1; + private static final int col_lastModified = 2; + private static final int col_hitcount = 3; + private static final int col_language = 4; + private static final int col_doctype = 5; + private static final int col_localflag = 6; + private static final int col_posintext = 7; + private static final int col_posinphrase = 8; + private static final int col_posofphrase = 9; + private static final int col_worddistance = 10; + private static final int col_wordcount = 11; + private static final int col_phrasecount = 12; + + + private kelondroRow.Entry entry; + + public indexRWIEntryOld(String urlHash, + int urlLength, // byte-length of complete URL + int urlComps, // number of path components + int titleLength, // length of description/length (longer are better?) + int hitcount, //*how often appears this word in the text + int wordcount, //*total number of words + int phrasecount, //*total number of phrases + int posintext, //*position of word in all words + int posinphrase, //*position of word in its phrase + int posofphrase, //*number of the phrase where word appears + int worddistance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search + int sizeOfPage, // # of bytes of the page + long lastmodified, //*last-modified time of the document where word appears + long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short + int quality, //*the entropy value + String language, //*(guessed) language of document + char doctype, //*type of document + int outlinksSame, // outlinks to same domain + int outlinksOther,// outlinks to other domain + boolean local //*flag shows that this index was generated locally; othervise its from a remote peer + ) { + + // more needed attributes: + // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, hervorhebungen, meta-tags, word in link etc + // - boolean: URL attributes + assert (urlHash.length() == 12) : "urlhash = " + urlHash; + if ((language == null) || (language.length() != urlLanguageLength)) language = "uk"; + this.entry = urlEntryRow.newEntry(); + this.entry.setCol(col_urlhash, urlHash, null); + this.entry.setCol(col_quality, quality); + this.entry.setCol(col_lastModified, lastmodified); + this.entry.setCol(col_hitcount, hitcount); + this.entry.setCol(col_language, language, null); + this.entry.setCol(col_doctype, (byte) doctype); + this.entry.setCol(col_localflag, (byte) ((local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL)); + this.entry.setCol(col_posintext, posintext); + this.entry.setCol(col_posinphrase, posinphrase); + this.entry.setCol(col_posofphrase, posofphrase); + this.entry.setCol(col_worddistance, worddistance); + this.entry.setCol(col_wordcount, wordcount); + this.entry.setCol(col_phrasecount, phrasecount); + //System.out.println("DEBUG-NEWENTRY " + toPropertyForm()); + } + + public indexRWIEntryOld(String urlHash, String code) { + // the code is the external form of the row minus the leading urlHash entry + this.entry = urlEntryRow.newEntry((urlHash + code).getBytes()); + } + + public indexRWIEntryOld(String external) { + this.entry = urlEntryRow.newEntry(external); + } + + public indexRWIEntryOld(byte[] row) { + this.entry = urlEntryRow.newEntry(row); + } + + public indexRWIEntryOld(kelondroRow.Entry rentry) { + // FIXME: see if cloning is necessary + this.entry = rentry; + } + + public Object clone() { + byte[] b = new byte[urlEntryRow.objectsize()]; + System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize()); + return new indexRWIEntryOld(b); + } + + public String toPropertyForm(boolean displayFormat) { + return entry.toPropertyForm(true, displayFormat, displayFormat); + } + + public Entry toKelondroEntry() { + return this.entry; + } + + public String urlHash() { + return this.entry.getColString(col_urlhash, null); + } + + public int quality() { + return (int) this.entry.getColLong(col_quality); + } + + public int virtualAge() { + return plasmaWordIndex.microDateDays(lastModified()); + } + + public long lastModified() { + return (int) this.entry.getColLong(col_lastModified); + } + + public int hitcount() { + return (int) this.entry.getColLong(col_hitcount); + } + + public int posintext() { + return (int) this.entry.getColLong(col_posintext); + } + + public int posinphrase() { + return (int) this.entry.getColLong(col_posinphrase); + } + + public int posofphrase() { + return (int) this.entry.getColLong(col_posofphrase); + } + + public int wordcount() { + return (int) this.entry.getColLong(col_wordcount); + } + + public int phrasecount() { + return (int) this.entry.getColLong(col_phrasecount); + } + + public String getLanguage() { + return this.entry.getColString(col_language, null); + } + + public char getType() { + return (char) this.entry.getColByte(col_doctype); + } + + public boolean isLocal() { + return this.entry.getColByte(col_localflag) == indexEntryAttribute.LT_LOCAL; + } + + public static indexRWIEntryOld combineDistance(indexRWIEntryOld ie1, indexRWIEntry ie2) { + // returns a modified entry of the first argument + ie1.entry.setCol(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext())); + ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2.posintext())); + ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? ie1.posofphrase() : 0 /*unknown*/); + ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase())); + ie1.entry.setCol(col_wordcount, (ie1.wordcount() + ie2.wordcount()) / 2); + return ie1; + } + + public void combineDistance(indexRWIEntry oe) { + combineDistance(this, oe); + } + + public int worddistance() { + return (int) this.entry.getColLong(col_worddistance); + } + + public static final void min(indexRWIEntryOld t, indexRWIEntry other) { + if (t.hitcount() > other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount()); + if (t.wordcount() > other.wordcount()) t.entry.setCol(col_wordcount, other.wordcount()); + if (t.phrasecount() > other.phrasecount()) t.entry.setCol(col_phrasecount, other.phrasecount()); + if (t.posintext() > other.posintext()) t.entry.setCol(col_posintext, other.posintext()); + if (t.posinphrase() > other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase()); + if (t.posofphrase() > other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase()); + if (t.worddistance() > other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance()); + if (t.lastModified() > other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified()); + if (t.quality() > other.quality()) t.entry.setCol(col_quality, other.quality()); + } + + public static final void max(indexRWIEntryOld t, indexRWIEntry other) { + if (t.hitcount() < other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount()); + if (t.wordcount() < other.wordcount()) t.entry.setCol(col_wordcount, other.wordcount()); + if (t.phrasecount() < other.phrasecount()) t.entry.setCol(col_phrasecount, other.phrasecount()); + if (t.posintext() < other.posintext()) t.entry.setCol(col_posintext, other.posintext()); + if (t.posinphrase() < other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase()); + if (t.posofphrase() < other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase()); + if (t.worddistance() < other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance()); + if (t.lastModified() < other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified()); + if (t.quality() < other.quality()) t.entry.setCol(col_quality, other.quality()); + } + + + public void min(indexRWIEntry other) { + min(this, other); + } + + public void max(indexRWIEntry other) { + max(this, other); + } + + static void normalize(indexRWIEntryOld t, indexRWIEntry min, indexRWIEntry max) { + assert (t.urlHash().length() == 12) : "turlhash = " + t.urlHash(); + assert (min.urlHash().length() == 12) : "minurlhash = " + min.urlHash(); + assert (max.urlHash().length() == 12) : "maxurlhash = " + max.urlHash(); + if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm(true) + "\nmax=" + max.toPropertyForm(true)); + //System.out.println("Normalize:\nentry = " + t.toPropertyForm(true)); + //System.out.println("min = " + min.toPropertyForm(true)); + //System.out.println("max = " + max.toPropertyForm(true)); + t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount())); + t.entry.setCol(col_wordcount , (t.wordcount() == 0) ? 0 : 1 + 255 * (t.wordcount() - min.wordcount() ) / (1 + max.wordcount() - min.wordcount())); + t.entry.setCol(col_phrasecount , (t.phrasecount() == 0) ? 0 : 1 + 255 * (t.phrasecount() - min.phrasecount() ) / (1 + max.phrasecount() - min.phrasecount())); + t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext())); + t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase())); + t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase())); + t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); // FIXME: hier gibts ein division by zero, was nur sein kann wenn die Normalisierung nicht geklappt hat. + t.entry.setCol(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified())); + t.entry.setCol(col_quality , (t.quality() == 0) ? 0 : 1 + 255 * (t.quality() - min.quality() ) / (1 + max.quality() - min.quality())); + //System.out.println("out = " + t.toPropertyForm(true)); + } + + public void normalize(indexRWIEntry min, indexRWIEntry max) { + normalize(this, min, max); + } + + public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max) { + assert (this.urlHash().length() == 12) : "this.urlhash = " + this.urlHash(); + indexRWIEntryOld e = (indexRWIEntryOld) this.clone(); + e.normalize(min, max); + return e; + } + + public boolean isNewer(indexRWIEntry other) { + if (other == null) return true; + if (this.lastModified() > other.lastModified()) return true; + if (this.lastModified() == other.lastModified()) { + if (this.quality() > other.quality()) return true; + } + return false; + } + + public boolean isOlder(indexRWIEntry other) { + if (other == null) return false; + if (this.lastModified() < other.lastModified()) return true; + if (this.lastModified() == other.lastModified()) { + if (this.quality() < other.quality()) return true; + } + return false; + } + +} diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java index f67d0265b..fc2e09979 100644 --- a/source/de/anomic/index/indexURL.java +++ b/source/de/anomic/index/indexURL.java @@ -50,29 +50,6 @@ public class indexURL { // day formatter for entry export public static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd"); - // statics for value lengths - public static final int urlHashLength = yacySeedDB.commonHashLength; // 12 - public static final int urlStringLength = 256;// not too short for links without parameters - public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or

) - public static final int urlNameLength = 40; // the tag content between and - public static final int urldescrtagsLength = 320;// the url, the description and tags in one string - public static final int urlErrorLength = 80; // a reason description for unavailable urls - public static final int urlDateLength = 4; // any date, shortened - public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index - public static final int urlFlagLength = 2; // any stuff - public static final int urlQualityLength = 3; // taken from heuristic - public static final int urlLanguageLength = 2; // taken from TLD suffix as quick-hack - public static final int urlDoctypeLength = 1; // taken from extension - public static final int urlSizeLength = 6; // the source size, from cache - public static final int urlWordCountLength = 3; // the number of words, from condenser - public static final int urlCrawlProfileHandleLength = 4; // name of the prefetch profile - public static final int urlCrawlDepthLength = 2; // prefetch depth, first is '0' - public static final int urlParentBranchesLength = 3; // number of anchors of the parent - public static final int urlForkFactorLength = 4; // sum of anchors of all ancestors - public static final int urlRetryLength = 2; // number of load retries - public static final int urlHostLength = 8; // the host as struncated name - public static final int urlHandleLength = 4; // a handle - private static final String[] TLD_NorthAmericaOceania={ // primary english-speaking countries // english-speaking countries from central america are also included @@ -397,7 +374,7 @@ public class indexURL { static { // create a dummy hash dummyHash = ""; - for (int i = 0; i < urlHashLength; i++) dummyHash += "-"; + for (int i = 0; i < yacySeedDB.commonHashLength; i++) dummyHash += "-"; // assign TLD-ids and names insertTLDProps(TLD_EuropaRussia, 0); @@ -602,13 +579,13 @@ public class indexURL { public static final String oldurlHash(URL url) { if (url == null) return null; - String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, urlHashLength); + String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, yacySeedDB.commonHashLength); return hash; } public static final String oldurlHash(String url) throws MalformedURLException { if ((url == null) || (url.length() < 10)) return null; - String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(new URL(url).toNormalform())).substring(0, urlHashLength); + String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(new URL(url).toNormalform())).substring(0, yacySeedDB.commonHashLength); return hash; } @@ -618,10 +595,10 @@ public class indexURL { TreeMap doms = new TreeMap(); synchronized(inputContainer) { Iterator i = inputContainer.entries(); - indexEntry iEntry; + indexRWIEntry iEntry; String dom, paths; while (i.hasNext()) { - iEntry = (indexEntry) i.next(); + iEntry = (indexRWIEntry) i.next(); if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer dom = iEntry.urlHash().substring(6); if ((paths = (String) doms.get(dom)) == null) { diff --git a/source/de/anomic/plasma/plasmaCrawlLURLEntry.java b/source/de/anomic/index/indexURLEntry.java similarity index 90% rename from source/de/anomic/plasma/plasmaCrawlLURLEntry.java rename to source/de/anomic/index/indexURLEntry.java index fd079efb3..6531210ff 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -1,6 +1,6 @@ -// plasmaCrawlLURLEntry.java +// indexURLEntry.java // (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany -// first published 13.10.2006 on http://www.anomic.de +// first published 2006 on http://www.anomic.de // // This is a part of YaCy, a peer-to-peer based web search engine // @@ -24,7 +24,8 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.plasma; + +package de.anomic.index; import java.io.IOException; import java.net.MalformedURLException; @@ -32,9 +33,9 @@ import java.util.Date; import de.anomic.kelondro.kelondroRow; import de.anomic.net.URL; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; -public interface plasmaCrawlLURLEntry { +public interface indexURLEntry { public kelondroRow.Entry toRowEntry() throws IOException; public String hash(); @@ -48,8 +49,8 @@ public interface plasmaCrawlLURLEntry { public int size(); public int wordCount(); public String snippet(); - public indexEntry word(); - public boolean isOlder(plasmaCrawlLURLEntry other); + public indexRWIEntry word(); + public boolean isOlder(indexURLEntry other); public String toString(String snippet); public String toString(); @@ -82,4 +83,4 @@ public interface plasmaCrawlLURLEntry { public String ETag() { return this.ETag; } } -} +} \ No newline at end of file diff --git a/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java b/source/de/anomic/index/indexURLEntryNew.java similarity index 92% rename from source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java rename to source/de/anomic/index/indexURLEntryNew.java index 9e1dd758b..2cc9dcf34 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java +++ b/source/de/anomic/index/indexURLEntryNew.java @@ -1,4 +1,4 @@ -package de.anomic.plasma; +package de.anomic.index; import java.io.IOException; import java.net.MalformedURLException; @@ -7,9 +7,6 @@ import java.util.Date; import java.util.Properties; import java.util.ArrayList; -import de.anomic.index.indexEntry; -import de.anomic.index.indexURL; -import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroRow; @@ -20,8 +17,10 @@ import de.anomic.tools.crypt; import de.anomic.tools.bitfield; import de.anomic.tools.nxTools; -public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { +public class indexURLEntryNew implements indexURLEntry { + // this object stores attributes for URL entries + public static final kelondroRow rowdef = new kelondroRow( "String hash-12, " + // the url's hash "String comp-360, " + // components: the url, description, author and tags. As 5th element, an ETag is possible @@ -37,16 +36,16 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { "String lang-2, " + // language "Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width "Cardinal lother-2 {b256}, " + // # of outlinks to outside domain; for video and image: height - "Cardinal limage-2 {b256}, " + // # of embedded image links + "Cardinal limage-2 {b256}, " + // # of embedded image links "Cardinal laudio-2 {b256}, " + // # of embedded audio links; for audio: track number; for video: number of audio tracks "Cardinal lvideo-2 {b256}, " + // # of embedded video links - "Cardinal lapp-2 {b256}"); // # of embedded links to applications + "Cardinal lapp-2 {b256}"); // # of embedded links to applications private kelondroRow.Entry entry; private String snippet; - private indexEntry word; // this is only used if the url is transported via remote search requests + private indexRWIEntry word; // this is only used if the url is transported via remote search requests - public plasmaCrawlLURLNewEntry( + public indexURLEntryNew( URL url, String descr, String author, @@ -106,13 +105,13 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { return s.toString().getBytes(); } - public plasmaCrawlLURLNewEntry(kelondroRow.Entry entry, indexEntry searchedWord) { + public indexURLEntryNew(kelondroRow.Entry entry, indexRWIEntry searchedWord) { this.entry = entry; this.snippet = null; this.word = searchedWord; } - public plasmaCrawlLURLNewEntry(Properties prop){ + public indexURLEntryNew(Properties prop){ // generates an plasmaLURLEntry using the properties from the argument // the property names must correspond to the one from toString //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); @@ -159,12 +158,12 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { this.entry.setCol("lvideo", Integer.parseInt(prop.getProperty("lvideo", "0"))); this.entry.setCol("lapp", Integer.parseInt(prop.getProperty("lapp", "0"))); this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null); - this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null; + this.word = (prop.containsKey("word")) ? new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null; } private StringBuffer corePropList() { // generate a parseable string; this is a simple property-list - plasmaCrawlLURLEntry.Components comp = this.comp(); + indexURLEntry.Components comp = this.comp(); final StringBuffer s = new StringBuffer(300); try { s.append("hash=").append(hash()); @@ -217,9 +216,9 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { return this.entry.getColString("hash", "", null); } - public plasmaCrawlLURLEntry.Components comp() { + public indexURLEntry.Components comp() { ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8"); - return new de.anomic.plasma.plasmaCrawlLURLEntry.Components( + return new indexURLEntry.Components( (cl.size() > 0) ? (String) cl.get(0) : "", (cl.size() > 1) ? (String) cl.get(1) : "", (cl.size() > 2) ? (String) cl.get(2) : "", @@ -299,11 +298,11 @@ public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { return snippet; } - public indexEntry word() { + public indexRWIEntry word() { return word; } - public boolean isOlder(plasmaCrawlLURLEntry other) { + public boolean isOlder(indexURLEntry other) { if (other == null) return false; Date tmoddate = moddate(); Date omoddate = other.moddate(); diff --git a/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java b/source/de/anomic/index/indexURLEntryOld.java similarity index 84% rename from source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java rename to source/de/anomic/index/indexURLEntryOld.java index 84fb66f66..4e0ca13d0 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java +++ b/source/de/anomic/index/indexURLEntryOld.java @@ -24,39 +24,37 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.plasma; +package de.anomic.index; import java.io.IOException; import java.util.Date; import java.util.Properties; import de.anomic.http.httpc; -import de.anomic.index.indexEntry; -import de.anomic.index.indexURL; -import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroRow; import de.anomic.net.URL; import de.anomic.server.logging.serverLog; import de.anomic.tools.bitfield; import de.anomic.tools.crypt; +import de.anomic.yacy.yacySeedDB; -public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { +public class indexURLEntryOld implements indexURLEntry { public static final kelondroRow rowdef = new kelondroRow( - "String urlhash-" + indexURL.urlHashLength + ", " + // the url's hash - "String urlstring-" + indexURL.urlStringLength + ", " + // the url as string - "String urldescr-" + indexURL.urlDescrLength + ", " + // the description of the url - "Cardinal moddate-" + indexURL.urlDateLength + " {b64e}, " + // last-modified from the httpd - "Cardinal loaddate-" + indexURL.urlDateLength + " {b64e}, " + // time when the url was loaded - "String refhash-" + indexURL.urlHashLength + ", " + // the url's referrer hash - "Cardinal copycount-" + indexURL.urlCopyCountLength + " {b64e}, " + // - "byte[] flags-" + indexURL.urlFlagLength + ", " + // flags - "Cardinal quality-" + indexURL.urlQualityLength + " {b64e}, " + // - "String language-" + indexURL.urlLanguageLength + ", " + // - "byte[] doctype-" + indexURL.urlDoctypeLength + ", " + // - "Cardinal size-" + indexURL.urlSizeLength + " {b64e}, " + // size of file in bytes - "Cardinal wc-" + indexURL.urlWordCountLength + " {b64e}"); // word count + "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash + "String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string + "String urldescr-" + indexRWIEntryOld.urlDescrLength + ", " + // the description of the url + "Cardinal moddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // last-modified from the httpd + "Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // time when the url was loaded + "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash + "Cardinal copycount-" + indexRWIEntryOld.urlCopyCountLength + " {b64e}, " + // + "byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags + "Cardinal quality-" + indexRWIEntryOld.urlQualityLength + " {b64e}, " + // + "String language-" + indexRWIEntryOld.urlLanguageLength + ", " + // + "byte[] doctype-" + indexRWIEntryOld.urlDoctypeLength + ", " + // + "Cardinal size-" + indexRWIEntryOld.urlSizeLength + " {b64e}, " + // size of file in bytes + "Cardinal wc-" + indexRWIEntryOld.urlWordCountLength + " {b64e}"); // word count private URL url; private String descr; @@ -72,9 +70,9 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { private int size; private int wordCount; private String snippet; - private indexEntry word; // this is only used if the url is transported via remote search requests + private indexRWIEntry word; // this is only used if the url is transported via remote search requests - public plasmaCrawlLURLOldEntry( + public indexURLEntryOld( URL url, String descr, String author, @@ -114,7 +112,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { this.word = null; } - public plasmaCrawlLURLOldEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException { + public indexURLEntryOld(kelondroRow.Entry entry, indexRWIEntry searchedWord) throws IOException { try { this.urlHash = entry.getColString(0, null); this.url = new URL(entry.getColString(1, "UTF-8")); @@ -138,7 +136,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { } } - public plasmaCrawlLURLOldEntry(Properties prop) { + public indexURLEntryOld(Properties prop) { // generates an plasmaLURLEntry using the properties from the argument // the property names must correspond to the one from toString //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); @@ -161,7 +159,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { this.snippet = prop.getProperty("snippet", ""); if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); - this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null; + this.word = (prop.containsKey("word")) ? new indexRWIEntryOld(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null; } catch (Exception e) { serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2:" @@ -178,8 +176,8 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { } public kelondroRow.Entry toRowEntry() throws IOException { - final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexURL.urlDateLength); - final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexURL.urlDateLength); + final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); + final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); final byte[][] entry = new byte[][] { urlHash.getBytes(), @@ -188,13 +186,13 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { moddatestr.getBytes(), loaddatestr.getBytes(), referrerHash.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(copyCount, indexURL.urlCopyCountLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(copyCount, indexRWIEntryOld.urlCopyCountLength).getBytes(), flags.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(quality, indexURL.urlQualityLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(quality, indexRWIEntryOld.urlQualityLength).getBytes(), language.getBytes(), new byte[] { (byte) doctype }, - kelondroBase64Order.enhancedCoder.encodeLong(size, indexURL.urlSizeLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(wordCount, indexURL.urlWordCountLength).getBytes()}; + kelondroBase64Order.enhancedCoder.encodeLong(size, indexRWIEntryOld.urlSizeLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(wordCount, indexRWIEntryOld.urlWordCountLength).getBytes()}; return rowdef.newEntry(entry); } @@ -264,11 +262,11 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { return snippet; } - public indexEntry word() { + public indexRWIEntry word() { return word; } - public boolean isOlder(plasmaCrawlLURLEntry other) { + public boolean isOlder(indexURLEntry other) { if (other == null) return false; if (moddate.before(other.moddate())) return true; if (moddate.equals(other.moddate())) { @@ -292,7 +290,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { ",local=").append(((local()) ? "true" : "false")) .append(",q=").append( kelondroBase64Order.enhancedCoder.encodeLong( - quality, indexURL.urlQualityLength)) + quality, indexRWIEntryOld.urlQualityLength)) .append(",dt=").append(doctype).append(",lang=").append( language).append(",url=").append( crypt.simpleEncode(url.toString())).append( diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java index 6960ea857..95caa46c3 100644 --- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java +++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java @@ -51,6 +51,7 @@ import java.io.File; import java.io.IOException; import de.anomic.index.indexURL; +import de.anomic.index.indexRWIEntryOld; import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlLoaderMessage; @@ -297,7 +298,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW yacyCore.seedDB.mySeed.hash, this.name, (failreason==null)?"Unknown reason":failreason, - new bitfield(indexURL.urlFlagLength) + new bitfield(indexRWIEntryOld.urlFlagLength) ); // store the entry diff --git a/source/de/anomic/plasma/dbImport/AssortmentImporter.java b/source/de/anomic/plasma/dbImport/AssortmentImporter.java index 86183dde1..20a5640eb 100644 --- a/source/de/anomic/plasma/dbImport/AssortmentImporter.java +++ b/source/de/anomic/plasma/dbImport/AssortmentImporter.java @@ -5,7 +5,7 @@ import java.io.IOException; import java.util.Iterator; import de.anomic.index.indexContainer; -import de.anomic.index.indexURLEntry; +import de.anomic.index.indexRWIEntryOld; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndexAssortment; @@ -63,7 +63,7 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{ // initializing the import assortment db this.log.logInfo("Initializing source assortment file"); try { - this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, indexURLEntry.urlEntryRow, assortmentNr, this.cacheSize/1024, preloadTime, this.log); + this.assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath, indexRWIEntryOld.urlEntryRow, assortmentNr, this.cacheSize/1024, preloadTime, this.log); } catch (IOException e) { e.printStackTrace(); System.exit(-1); diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index 6528a8bd9..4026e022a 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -7,10 +7,10 @@ import java.util.Iterator; import java.util.TreeSet; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.plasma.plasmaCrawlLURL; -import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; import de.anomic.server.serverDate; @@ -134,13 +134,13 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { // loop throug the entities of the container and get the // urlhash Iterator importWordIdxEntries = newContainer.entries(); - indexEntry importWordIdxEntry; + indexRWIEntry importWordIdxEntry; while (importWordIdxEntries.hasNext()) { // testing if import process was aborted if (isAborted()) break; // getting next word index entry - importWordIdxEntry = (indexEntry) importWordIdxEntries.next(); + importWordIdxEntry = (indexRWIEntry) importWordIdxEntries.next(); String urlHash = importWordIdxEntry.urlHash(); entityUrls.add(urlHash); } @@ -162,7 +162,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { // we need to import the url // getting the url entry - plasmaCrawlLURLEntry urlEntry = this.importUrlDB.load(urlHash, null); + indexURLEntry urlEntry = this.importUrlDB.load(urlHash, null); if (urlEntry != null) { /* write it into the home url db */ diff --git a/source/de/anomic/plasma/plasmaCrawlBalancer.java b/source/de/anomic/plasma/plasmaCrawlBalancer.java index 81280bea0..ccde8689f 100644 --- a/source/de/anomic/plasma/plasmaCrawlBalancer.java +++ b/source/de/anomic/plasma/plasmaCrawlBalancer.java @@ -48,10 +48,10 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroStack; +import de.anomic.yacy.yacySeedDB; public class plasmaCrawlBalancer { @@ -59,7 +59,7 @@ public class plasmaCrawlBalancer { private HashMap domainStacks; public plasmaCrawlBalancer(File stackFile) { - stack = kelondroStack.open(stackFile, new kelondroRow("byte[] urlhash-" + indexURL.urlHashLength)); + stack = kelondroStack.open(stackFile, new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength)); domainStacks = new HashMap(); } diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index 74dc6c6f8..0bb32a489 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -54,12 +54,14 @@ import java.util.Iterator; import java.util.LinkedList; import de.anomic.index.indexURL; +import de.anomic.index.indexRWIEntryOld; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroFlexTable; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroTree; import de.anomic.net.URL; import de.anomic.tools.bitfield; +import de.anomic.yacy.yacySeedDB; public class plasmaCrawlEURL extends indexURL { @@ -134,17 +136,17 @@ public class plasmaCrawlEURL extends indexURL { public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) { super(); kelondroRow rowdef = new kelondroRow( - "String urlhash-" + urlHashLength + ", " + // the url's hash - "String refhash-" + urlHashLength + ", " + // the url's referrer hash - "String initiator-" + urlHashLength + ", " + // the crawling initiator - "String executor-" + urlHashLength + ", " + // the crawling executor - "String urlstring-" + urlStringLength + ", " + // the url as string - "String urlname-" + urlNameLength + ", " + // the name of the url, from anchor tag name - "Cardinal appdate-" + urlDateLength + " {b64e}, " + // the time when the url was first time appeared - "Cardinal loaddate-" + urlDateLength + " {b64e}, " + // the time when the url was last time tried to load - "Cardinal retrycount-" + urlRetryLength + " {b64e}, " + // number of load retries - "String failcause-" + urlErrorLength + ", " + // string describing load failure - "byte[] flags-" + urlFlagLength); // extra space + "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash + "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash + "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator + "String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor + "String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string + "String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag name + "Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared + "Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was last time tried to load + "Cardinal retrycount-" + indexRWIEntryOld.urlRetryLength + " {b64e}, " + // number of load retries + "String failcause-" + indexRWIEntryOld.urlErrorLength + ", " + // string describing load failure + "byte[] flags-" + indexRWIEntryOld.urlFlagLength); // extra space if (newdb) { String newCacheName = "urlErr3.table"; @@ -164,9 +166,9 @@ public class plasmaCrawlEURL extends indexURL { public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor, String name, String failreason, bitfield flags) { - if ((referrer == null) || (referrer.length() < urlHashLength)) referrer = dummyHash; - if ((initiator == null) || (initiator.length() < urlHashLength)) initiator = dummyHash; - if ((executor == null) || (executor.length() < urlHashLength)) executor = dummyHash; + if ((referrer == null) || (referrer.length() < yacySeedDB.commonHashLength)) referrer = dummyHash; + if ((initiator == null) || (initiator.length() < yacySeedDB.commonHashLength)) initiator = dummyHash; + if ((executor == null) || (executor.length() < yacySeedDB.commonHashLength)) executor = dummyHash; if (failreason == null) failreason = "unknown"; return new Entry(url, referrer, initiator, executor, name, failreason, flags); } @@ -289,8 +291,8 @@ public class plasmaCrawlEURL extends indexURL { // stores the values from the object variables into the database if (this.stored) return; if (this.hash == null) return; - String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength); - String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength); + String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); + String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); // store the hash in the hash cache try { @@ -304,7 +306,7 @@ public class plasmaCrawlEURL extends indexURL { this.name.getBytes(), initdatestr.getBytes(), trydatestr.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, urlRetryLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, indexRWIEntryOld.urlRetryLength).getBytes(), this.failreason.getBytes(), this.flags.getBytes() }; diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 459f229f6..a592cf8c2 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -55,17 +55,18 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.text.SimpleDateFormat; import java.util.Date; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; -import java.util.Locale; import de.anomic.http.httpc; import de.anomic.http.httpc.response; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; +import de.anomic.index.indexURLEntryNew; +import de.anomic.index.indexURLEntryOld; import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroFlexSplitTable; import de.anomic.kelondro.kelondroBase64Order; @@ -74,12 +75,9 @@ import de.anomic.kelondro.kelondroTree; import de.anomic.net.URL; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCodings; -import de.anomic.server.serverObjects; import de.anomic.server.logging.serverLog; import de.anomic.tools.bitfield; -import de.anomic.tools.nxTools; -import de.anomic.yacy.yacyCore; -import de.anomic.yacy.yacySeed; +import de.anomic.yacy.yacySeedDB; public final class plasmaCrawlLURL extends indexURL { @@ -101,11 +99,11 @@ public final class plasmaCrawlLURL extends indexURL { try { if (newdb) { - urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, plasmaCrawlLURLNewEntry.rowdef, kelondroBase64Order.enhancedCoder); + urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, indexURLEntryNew.rowdef, kelondroBase64Order.enhancedCoder); } else { File oldLURLDB = new File(plasmaPath, "urlHash.db"); oldLURLDB.getParentFile().mkdirs(); - urlIndexFile = new kelondroCache(new kelondroTree(oldLURLDB, bufferkb / 2 * 0x400, preloadTime, plasmaCrawlLURLOldEntry.rowdef), bufferkb / 2 * 0x400, true, false); + urlIndexFile = new kelondroCache(new kelondroTree(oldLURLDB, bufferkb / 2 * 0x400, preloadTime, indexURLEntryOld.rowdef), bufferkb / 2 * 0x400, true, false); } } catch (IOException e) { e.printStackTrace(); @@ -121,7 +119,7 @@ public final class plasmaCrawlLURL extends indexURL { gcrawlResultStack = new LinkedList(); } - public synchronized void stack(plasmaCrawlLURLEntry e, String initiatorHash, String executorHash, int stackType) { + public synchronized void stack(indexURLEntry e, String initiatorHash, String executorHash, int stackType) { if (e == null) { return; } try { if (initiatorHash == null) { initiatorHash = dummyHash; } @@ -159,7 +157,7 @@ public final class plasmaCrawlLURL extends indexURL { return 0; } - public synchronized plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) { + public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord) { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. @@ -171,17 +169,17 @@ public final class plasmaCrawlLURL extends indexURL { kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes()); if (entry == null) return null; if (newdb) - return new plasmaCrawlLURLNewEntry(entry, searchedWord); + return new indexURLEntryNew(entry, searchedWord); else - return new plasmaCrawlLURLOldEntry(entry, searchedWord); + return new indexURLEntryOld(entry, searchedWord); } catch (IOException e) { return null; } } - public synchronized void store(plasmaCrawlLURLEntry entry) throws IOException { + public synchronized void store(indexURLEntry entry) throws IOException { // Check if there is a more recent Entry already in the DB - plasmaCrawlLURLEntry oldEntry; + indexURLEntry oldEntry; try { if (exists(entry.hash())) { oldEntry = load(entry.hash(), null); @@ -202,18 +200,18 @@ public final class plasmaCrawlLURL extends indexURL { urlIndexFile.put(entry.toRowEntry(), entry.loaddate()); } - public synchronized plasmaCrawlLURLEntry newEntry(String propStr) { + public synchronized indexURLEntry newEntry(String propStr) { if (propStr.startsWith("{") && propStr.endsWith("}")) { if (newdb) - return new plasmaCrawlLURLNewEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); + return new indexURLEntryNew(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); else - return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); + return new indexURLEntryOld(serverCodings.s2p(propStr.substring(1, propStr.length() - 1))); } else { return null; } } - public synchronized plasmaCrawlLURLEntry newEntry( + public synchronized indexURLEntry newEntry( URL url, String descr, String author, @@ -236,10 +234,10 @@ public final class plasmaCrawlLURL extends indexURL { int lvideo, int lapp) { if (newdb) - return new plasmaCrawlLURLNewEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5, + return new indexURLEntryNew(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5, size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp); else - return new plasmaCrawlLURLOldEntry(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5, + return new indexURLEntryOld(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5, size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp); } @@ -257,36 +255,36 @@ public final class plasmaCrawlLURL extends indexURL { public synchronized String getUrlHash(int stack, int pos) { switch (stack) { - case 1: return ((String) externResultStack.get(pos)).substring(0, urlHashLength); - case 2: return ((String) searchResultStack.get(pos)).substring(0, urlHashLength); - case 3: return ((String) transfResultStack.get(pos)).substring(0, urlHashLength); - case 4: return ((String) proxyResultStack.get(pos)).substring(0, urlHashLength); - case 5: return ((String) lcrawlResultStack.get(pos)).substring(0, urlHashLength); - case 6: return ((String) gcrawlResultStack.get(pos)).substring(0, urlHashLength); + case 1: return ((String) externResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); + case 2: return ((String) searchResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); + case 3: return ((String) transfResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); + case 4: return ((String) proxyResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); + case 5: return ((String) lcrawlResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); + case 6: return ((String) gcrawlResultStack.get(pos)).substring(0, yacySeedDB.commonHashLength); } return null; } public synchronized String getInitiatorHash(int stack, int pos) { switch (stack) { - case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); - case 2: return ((String) searchResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); - case 3: return ((String) transfResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); - case 4: return ((String) proxyResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); - case 5: return ((String) lcrawlResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); - case 6: return ((String) gcrawlResultStack.get(pos)).substring(urlHashLength, urlHashLength * 2); + case 1: return ((String) externResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); + case 2: return ((String) searchResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); + case 3: return ((String) transfResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); + case 4: return ((String) proxyResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); + case 5: return ((String) lcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); + case 6: return ((String) gcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength, yacySeedDB.commonHashLength * 2); } return null; } public synchronized String getExecutorHash(int stack, int pos) { switch (stack) { - case 1: return ((String) externResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); - case 2: return ((String) searchResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); - case 3: return ((String) transfResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); - case 4: return ((String) proxyResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); - case 5: return ((String) lcrawlResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); - case 6: return ((String) gcrawlResultStack.get(pos)).substring(urlHashLength * 2, urlHashLength * 3); + case 1: return ((String) externResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); + case 2: return ((String) searchResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); + case 3: return ((String) transfResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); + case 4: return ((String) proxyResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); + case 5: return ((String) lcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); + case 6: return ((String) gcrawlResultStack.get(pos)).substring(yacySeedDB.commonHashLength * 2, yacySeedDB.commonHashLength * 3); } return null; } @@ -341,88 +339,10 @@ public final class plasmaCrawlLURL extends indexURL { return false; } } - - private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); - private static String daydate(Date date) { - if (date == null) { - return ""; - } else { - return dayFormatter.format(date); - } - } - public serverObjects genTableProps(int tabletype, int lines, boolean showInit, boolean showExec, String dfltInit, String dfltExec, String feedbackpage, boolean makeLink) { -/* serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps tabletype=" + tabletype + " lines=" + lines + - " showInit=" + showInit + " showExec=" + showExec + - " dfltInit=" + dfltInit + " dfltExec=" + dfltExec + - " feedbackpage=" + feedbackpage + " makeLink=" + makeLink); */ - final serverObjects prop = new serverObjects(); - if (getStackSize(tabletype) == 0) { - prop.put("table", 0); - return prop; - } - prop.put("table", 1); - if (lines > getStackSize(tabletype)) lines = getStackSize(tabletype); - if (lines == getStackSize(tabletype)) { - prop.put("table_size", 0); - } else { - prop.put("table_size", 1); - prop.put("table_size_count", lines); - } - prop.put("table_size_all", getStackSize(tabletype)); - prop.put("table_feedbackpage", feedbackpage); - prop.put("table_tabletype", tabletype); - prop.put("table_showInit", (showInit) ? 1 : 0); - prop.put("table_showExec", (showExec) ? 1 : 0); - - boolean dark = true; - String urlHash, initiatorHash, executorHash; - String cachepath, urlstr, urltxt; - yacySeed initiatorSeed, executorSeed; - plasmaCrawlLURLEntry urle; - - // needed for getCachePath(url) - final plasmaSwitchboard switchboard = plasmaSwitchboard.getSwitchboard(); - final plasmaHTCache cacheManager = switchboard.getCacheManager(); - - int i, cnt = 0; - for (i = getStackSize(tabletype) - 1; i >= (getStackSize(tabletype) - lines); i--) { - initiatorHash = getInitiatorHash(tabletype, i); - executorHash = getExecutorHash(tabletype, i); -// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash); - urlHash = getUrlHash(tabletype, i); -// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash); - try { - urle = load(urlHash, null); - plasmaCrawlLURLEntry.Components comp = urle.comp(); -// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString()); - initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); - executorSeed = yacyCore.seedDB.getConnected(executorHash); - - urlstr = comp.url().toNormalform(); - urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL - cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1); - - prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0); - prop.put("table_indexed_" + cnt + "_feedbackpage", feedbackpage); - prop.put("table_indexed_" + cnt + "_tabletype", tabletype); - prop.put("table_indexed_" + cnt + "_urlhash", urlHash); - prop.put("table_indexed_" + cnt + "_showInit", (showInit) ? 1 : 0); - prop.put("table_indexed_" + cnt + "_showInit_initiatorSeed", (initiatorSeed == null) ? dfltInit : initiatorSeed.getName()); - prop.put("table_indexed_" + cnt + "_showExec", (showExec) ? 1 : 0); - prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? dfltExec : executorSeed.getName()); - prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate())); - prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount()); - prop.put("table_indexed_" + cnt + "_urldescr", comp.descr()); - prop.put("table_indexed_" + cnt + "_url", (cachepath == null) ? "-not-cached-" : ((makeLink) ? ("" + urltxt + "") : urlstr)); - dark = !dark; - cnt++; - } catch (Exception e) { - serverLog.logSevere("PLASMA", "genTableProps", e); - } - } - prop.put("table_indexed", cnt); - return prop; + public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException { + // enumerates entry elements + return new kiter(up, rotating, firstHash); } public class kiter implements Iterator { @@ -445,9 +365,9 @@ public final class plasmaCrawlLURL extends indexURL { if (e == null) return null; try { if (newdb) - return new plasmaCrawlLURLNewEntry(e, null); + return new indexURLEntryNew(e, null); else - return new plasmaCrawlLURLOldEntry(e, null); + return new indexURLEntryOld(e, null); } catch (IOException ex) { throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null)); } @@ -459,11 +379,6 @@ public final class plasmaCrawlLURL extends indexURL { } - public Iterator entries(boolean up, boolean rotating, String firstHash) throws IOException { - // enumerates entry elements - return new kiter(up, rotating, firstHash); - } - /** * Uses an Iteration over urlHash.db to detect malformed URL-Entries. * Damaged URL-Entries will be marked in a HashSet and removed at the end of the function. @@ -578,8 +493,8 @@ public final class plasmaCrawlLURL extends indexURL { } } - plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next(); - plasmaCrawlLURLEntry.Components comp = entry.comp(); + indexURLEntry entry = (indexURLEntry) eiter.next(); + indexURLEntry.Components comp = entry.comp(); totalSearchedUrls++; if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, comp.url()) || plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, comp.url())) { @@ -650,7 +565,7 @@ public final class plasmaCrawlLURL extends indexURL { final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0, false); final Iterator enu = urls.entries(true, false, null); while (enu.hasNext()) { - System.out.println(((plasmaCrawlLURLEntry) enu.next()).toString()); + System.out.println(((indexURLEntry) enu.next()).toString()); } } catch (Exception e) { e.printStackTrace(); diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index a3b88c45c..9e07210df 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -51,6 +51,7 @@ import java.util.HashSet; import java.util.Iterator; import de.anomic.index.indexURL; +import de.anomic.index.indexRWIEntryOld; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroException; @@ -62,6 +63,7 @@ import de.anomic.kelondro.kelondroTree; import de.anomic.net.URL; import de.anomic.server.logging.serverLog; import de.anomic.tools.bitfield; +import de.anomic.yacy.yacySeedDB; public class plasmaCrawlNURL extends indexURL { @@ -78,18 +80,18 @@ public class plasmaCrawlNURL extends indexURL { * column length definition for the {@link plasmaURL#urlIndexFile} DB */ public final static kelondroRow rowdef = new kelondroRow( - "String urlhash-" + urlHashLength + ", " + // the url's hash - "String initiator-" + urlHashLength + ", " + // the crawling initiator - "String urlstring-" + urlStringLength + ", " + // the url as string - "String refhash-" + urlHashLength + ", " + // the url's referrer hash - "String urlname-" + urlNameLength + ", " + // the name of the url, from anchor tag name - "Cardinal appdate-" + urlDateLength + " {b64e}, " + // the time when the url was first time appeared - "String profile-" + urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle - "Cardinal depth-" + urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0 - "Cardinal parentbr-" + urlParentBranchesLength + " {b64e}, " + // number of anchors of the parent - "Cardinal forkfactor-" + urlForkFactorLength + " {b64e}, " + // sum of anchors of all ancestors - "byte[] flags-" + urlFlagLength + ", " + // flags - "String handle-" + urlHandleLength); // extra handle + "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash + "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator + "String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string + "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash + "String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag name + "Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared + "String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle + "Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0 + "Cardinal parentbr-" + indexRWIEntryOld.urlParentBranchesLength + " {b64e}, " + // number of anchors of the parent + "Cardinal forkfactor-" + indexRWIEntryOld.urlForkFactorLength + " {b64e}, " + // sum of anchors of all ancestors + "byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags + "String handle-" + indexRWIEntryOld.urlHandleLength); // extra handle private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1 private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth @@ -128,7 +130,7 @@ public class plasmaCrawlNURL extends indexURL { limitStack = new plasmaCrawlBalancer(limitStackFile); overhangStack = new plasmaCrawlBalancer(overhangStackFile); remoteStack = new plasmaCrawlBalancer(remoteStackFile); - kelondroRow rowdef = new kelondroRow("byte[] urlhash-" + indexURL.urlHashLength); + kelondroRow rowdef = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength); imageStack = kelondroStack.open(imageStackFile, rowdef); movieStack = kelondroStack.open(movieStackFile, rowdef); musicStack = kelondroStack.open(musicStackFile, rowdef); @@ -257,7 +259,7 @@ public class plasmaCrawlNURL extends indexURL { private static String normalizeHandle(int h) { String d = Integer.toHexString(h); - while (d.length() < urlHandleLength) d = "0" + d; + while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d; return d; } @@ -479,7 +481,7 @@ public class plasmaCrawlNURL extends indexURL { this.depth = depth; this.anchors = anchors; this.forkfactor = forkfactor; - this.flags = new bitfield(urlFlagLength); + this.flags = new bitfield(indexRWIEntryOld.urlFlagLength); this.handle = 0; this.stored = false; } @@ -533,7 +535,7 @@ public class plasmaCrawlNURL extends indexURL { public void store() { // stores the values from the object variables into the database if (this.stored) return; - String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength); + String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); // store the hash in the hash cache try { // even if the entry exists, we simply overwrite it @@ -545,9 +547,9 @@ public class plasmaCrawlNURL extends indexURL { this.name.getBytes("UTF-8"), loaddatestr.getBytes(), (this.profileHandle == null) ? null : this.profileHandle.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.depth, urlCrawlDepthLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, urlParentBranchesLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, urlForkFactorLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(), this.flags.getBytes(), normalizeHandle(this.handle).getBytes() }; diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java index e14c3dee0..b8bfeffbf 100644 --- a/source/de/anomic/plasma/plasmaCrawlProfile.java +++ b/source/de/anomic/plasma/plasmaCrawlProfile.java @@ -48,7 +48,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.Map; -import de.anomic.index.indexURL; +import de.anomic.index.indexRWIEntryOld; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroException; @@ -68,7 +68,7 @@ public class plasmaCrawlProfile { this.bufferkb = bufferkb; this.preloadTime = preloadTime; profileTableFile.getParentFile().mkdirs(); - kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexURL.urlCrawlProfileHandleLength, 2000, '#'); + kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#'); profileTable = new kelondroMap(dyn); domsCache = new HashMap(); } @@ -94,7 +94,7 @@ public class plasmaCrawlProfile { if (profileTable != null) try { profileTable.close(); } catch (IOException e) {} if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database"); profileTableFile.getParentFile().mkdirs(); - kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexURL.urlCrawlProfileHandleLength, 2000, '#'); + kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#'); profileTable = new kelondroMap(dyn); } @@ -256,7 +256,7 @@ public class plasmaCrawlProfile { boolean storeHTCache, boolean storeTXCache, boolean localIndexing, boolean remoteIndexing, boolean xsstopw, boolean xdstopw, boolean xpstopw) { - String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, indexURL.urlCrawlProfileHandleLength); + String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, indexRWIEntryOld.urlCrawlProfileHandleLength); mem = new HashMap(); mem.put("handle", handle); mem.put("name", name); diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index a4d3bebf1..27fc0d9ed 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -60,6 +60,8 @@ import org.apache.commons.pool.impl.GenericObjectPool; import de.anomic.data.robotsParser; import de.anomic.http.httpc; import de.anomic.index.indexURL; +import de.anomic.index.indexRWIEntryOld; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroException; @@ -391,7 +393,7 @@ public final class plasmaCrawlStacker { checkInterruption(); String nexturlhash = indexURL.urlHash(nexturl); String dbocc = this.sb.urlPool.exists(nexturlhash); - plasmaCrawlLURLEntry oldEntry = null; + indexURLEntry oldEntry = null; oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null); boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder()); if ((dbocc != null) && (!(recrawl))) { @@ -490,7 +492,7 @@ public final class plasmaCrawlStacker { this.depth = depth; this.anchors = anchors; this.forkfactor = forkfactor; - this.flags = new bitfield(indexURL.urlFlagLength); + this.flags = new bitfield(indexRWIEntryOld.urlFlagLength); this.handle = 0; } catch (Exception e) { e.printStackTrace(); @@ -573,7 +575,7 @@ public final class plasmaCrawlStacker { public byte[][] getBytes() { // stores the values from the object variables into the database - String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexURL.urlDateLength); + String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); // store the hash in the hash cache // even if the entry exists, we simply overwrite it @@ -587,9 +589,9 @@ public final class plasmaCrawlStacker { this.name.getBytes("UTF-8"), loaddatestr.getBytes(), (this.profileHandle == null) ? null : this.profileHandle.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexURL.urlCrawlDepthLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexURL.urlParentBranchesLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexURL.urlForkFactorLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(), this.flags.getBytes(), normalizeHandle(this.handle).getBytes() }; @@ -599,7 +601,7 @@ public final class plasmaCrawlStacker { private String normalizeHandle(int h) { String d = Integer.toHexString(h); - while (d.length() < indexURL.urlHandleLength) d = "0" + d; + while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d; return d; } } @@ -1057,7 +1059,7 @@ public final class plasmaCrawlStacker { yacyCore.seedDB.mySeed.hash, this.theMsg.name, rejectReason, - new bitfield(indexURL.urlFlagLength) + new bitfield(indexRWIEntryOld.urlFlagLength) ); ee.store(); sb.urlPool.errorURL.stackPushEntry(ee); diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index 82873c20d..5fce764b7 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -48,7 +48,8 @@ import java.util.HashSet; import java.util.Iterator; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; import de.anomic.server.serverCodings; @@ -200,8 +201,8 @@ public class plasmaDHTChunk { Iterator indexContainerIterator = wordIndex.indexContainerSet(hash, resourceLevel, true, maxcount).iterator(); indexContainer container; Iterator urlIter; - indexEntry iEntry; - plasmaCrawlLURLEntry lurl; + indexRWIEntry iEntry; + indexURLEntry lurl; int refcount = 0; int wholesize; @@ -227,7 +228,7 @@ public class plasmaDHTChunk { urlIter = container.entries(); // iterate over indexes to fetch url entries and store them in the urlCache while ((urlIter.hasNext()) && (maxcount > refcount) && (System.currentTimeMillis() < timeout)) { - iEntry = (indexEntry) urlIter.next(); + iEntry = (indexRWIEntry) urlIter.next(); lurl = lurls.load(iEntry.urlHash(), iEntry); if ((lurl == null) || (lurl.comp().url() == null)) { //yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash()); @@ -243,7 +244,7 @@ public class plasmaDHTChunk { // remove all remaining; we have enough while (urlIter.hasNext()) { - iEntry = (indexEntry) urlIter.next(); + iEntry = (indexRWIEntry) urlIter.next(); urlIter.remove(); } @@ -285,7 +286,7 @@ public class plasmaDHTChunk { public synchronized String deleteTransferIndexes() { Iterator urlIter; - indexEntry iEntry; + indexRWIEntry iEntry; HashSet urlHashes; String count = "0"; @@ -299,7 +300,7 @@ public class plasmaDHTChunk { urlHashes = new HashSet(this.indexContainers[i].size()); urlIter = this.indexContainers[i].entries(); while (urlIter.hasNext()) { - iEntry = (indexEntry) urlIter.next(); + iEntry = (indexRWIEntry) urlIter.next(); urlHashes.add(iEntry.urlHash()); } String wordHash = indexContainers[i].getWordHash(); diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 496cd430e..701fa0318 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -90,6 +90,7 @@ import de.anomic.server.serverThread; import de.anomic.server.logging.serverLog; import de.anomic.tools.enumerateFiles; import de.anomic.yacy.yacySeed; +import de.anomic.yacy.yacySeedDB; public final class plasmaHTCache { @@ -173,7 +174,7 @@ public final class plasmaHTCache { // open the response header database File dbfile = new File(this.cachePath, "responseHeader.db"); try { - this.responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, preloadTime, indexURL.urlHashLength, 150, '#')); + this.responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, preloadTime, yacySeedDB.commonHashLength, 150, '#')); } catch (IOException e) { this.log.logSevere("the request header database could not be opened: " + e.getMessage()); System.exit(0); @@ -717,7 +718,7 @@ public final class plasmaHTCache { if (hexHash.indexOf('.') >= 0) return null; try { String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.decodeHex(hexHash)); - if (hash.length() == indexURL.urlHashLength) return hash; + if (hash.length() == yacySeedDB.commonHashLength) return hash; return null; } catch (Exception e) { //log.logWarning("getHash: " + e.getMessage(), e); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 07345cf12..39def7504 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -51,7 +51,8 @@ import java.util.Set; import java.util.TreeMap; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.server.logging.serverLog; @@ -379,8 +380,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable { //if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty //if (searchResult.size() == 0) return acc; // case that we have nothing to do - indexEntry entry; - plasmaCrawlLURLEntry page; + indexRWIEntry entry; + indexURLEntry page; Long preranking; Object[] preorderEntry; int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); @@ -388,7 +389,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { while (preorder.hasNext()) { if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break; preorderEntry = preorder.next(); - entry = (indexEntry) preorderEntry[0]; + entry = (indexRWIEntry) preorderEntry[0]; // load only urls if there was not yet a root url of that hash preranking = (Long) preorderEntry[1]; // find the url entry @@ -425,11 +426,11 @@ public final class plasmaSearchEvent extends Thread implements Runnable { preorder.remove(true, true); // start url-fetch - indexEntry entry; + indexRWIEntry entry; try { while (preorder.hasNext()) { if (System.currentTimeMillis() >= timeout) break; - entry = (indexEntry) (preorder.next()[0]); + entry = (indexRWIEntry) (preorder.next()[0]); // find and fetch the url entry urlStore.load(entry.urlHash(), entry); } diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java index 78a834304..1b80f4ba2 100644 --- a/source/de/anomic/plasma/plasmaSearchImages.java +++ b/source/de/anomic/plasma/plasmaSearchImages.java @@ -48,6 +48,7 @@ import java.util.Map; import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterImageEntry; +import de.anomic.index.indexURLEntry; import de.anomic.net.URL; import de.anomic.plasma.parser.ParserException; import de.anomic.server.serverDate; @@ -101,7 +102,7 @@ public final class plasmaSearchImages { public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, plasmaSearchResult sres, int depth) { long start = System.currentTimeMillis(); this.images = new TreeSet(); - plasmaCrawlLURLEntry urlentry; + indexURLEntry urlentry; while (sres.hasMoreElements()) { urlentry = sres.nextElement(); addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.comp().url(), depth)); diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 4985859f4..5b50aa0c9 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -50,7 +50,7 @@ import java.util.Map; import java.util.TreeMap; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroBinSearch; import de.anomic.server.serverCodings; @@ -61,7 +61,7 @@ public final class plasmaSearchPreOrder { public static kelondroBinSearch[] ybrTables = null; // block-rank tables private static boolean useYBR = true; - private indexEntry entryMin, entryMax; + private indexRWIEntry entryMin, entryMax; private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private plasmaSearchQuery query; private plasmaSearchRankingProfile ranking; @@ -79,7 +79,7 @@ public final class plasmaSearchPreOrder { this.ranking = ranking; long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; - indexEntry iEntry; + indexRWIEntry iEntry; // first pass: find min/max to obtain limits for normalization Iterator i = container.entries(); @@ -88,9 +88,9 @@ public final class plasmaSearchPreOrder { this.entryMax = null; while (i.hasNext()) { if (System.currentTimeMillis() > limitTime) break; - iEntry = (indexEntry) i.next(); - if (this.entryMin == null) this.entryMin = (indexEntry) iEntry.clone(); else this.entryMin.min(iEntry); - if (this.entryMax == null) this.entryMax = (indexEntry) iEntry.clone(); else this.entryMax.max(iEntry); + iEntry = (indexRWIEntry) i.next(); + if (this.entryMin == null) this.entryMin = (indexRWIEntry) iEntry.clone(); else this.entryMin.min(iEntry); + if (this.entryMax == null) this.entryMax = (indexRWIEntry) iEntry.clone(); else this.entryMax.max(iEntry); count++; } @@ -98,7 +98,7 @@ public final class plasmaSearchPreOrder { i = container.entries(); this.pageAcc = new TreeMap(); for (int j = 0; j < count; j++) { - iEntry = (indexEntry) i.next(); + iEntry = (indexRWIEntry) i.next(); pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry); } } @@ -110,13 +110,13 @@ public final class plasmaSearchPreOrder { HashSet doubleDoms = new HashSet(); Iterator i = pageAcc.entrySet().iterator(); Map.Entry entry; - indexEntry iEntry; + indexRWIEntry iEntry; String hashpart; boolean isWordRootURL; while (i.hasNext()) { if (pageAcc.size() <= query.wantedResults) break; entry = (Map.Entry) i.next(); - iEntry = (indexEntry) entry.getValue(); + iEntry = (indexRWIEntry) entry.getValue(); hashpart = iEntry.urlHash().substring(6); isWordRootURL = indexURL.isWordRootURL(iEntry.urlHash(), query.words("")); if ((!(isWordRootURL)) && @@ -192,11 +192,11 @@ public final class plasmaSearchPreOrder { e.printStackTrace(); preranking = new Long(0); } - return new Object[]{(indexEntry) pageAcc.remove(top), preranking}; + return new Object[]{(indexRWIEntry) pageAcc.remove(top), preranking}; } - public indexEntry[] getNormalizer() { - return new indexEntry[] {entryMin, entryMax}; + public indexRWIEntry[] getNormalizer() { + return new indexRWIEntry[] {entryMin, entryMax}; } public static int ybr_p(String urlHash) { diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index cd4874827..03a7230cc 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -51,6 +51,7 @@ import de.anomic.htmlFilter.htmlFilterAbstractScraper; import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.server.serverCharBuffer; +import de.anomic.yacy.yacySeedDB; public final class plasmaSearchQuery { @@ -120,16 +121,16 @@ public final class plasmaSearchQuery { public static Set hashes2Set(String query) { if (query == null) return new HashSet(); - final HashSet keyhashes = new HashSet(query.length() / indexEntryAttribute.wordHashLength); - for (int i = 0; i < (query.length() / indexEntryAttribute.wordHashLength); i++) { - keyhashes.add(query.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength)); + final HashSet keyhashes = new HashSet(query.length() / yacySeedDB.commonHashLength); + for (int i = 0; i < (query.length() / yacySeedDB.commonHashLength); i++) { + keyhashes.add(query.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength)); } return keyhashes; } public static String hashSet2hashString(Set words) { Iterator i = words.iterator(); - StringBuffer sb = new StringBuffer(words.size() * indexEntryAttribute.wordHashLength); + StringBuffer sb = new StringBuffer(words.size() * yacySeedDB.commonHashLength); while (i.hasNext()) sb.append((String) i.next()); return new String(sb); } diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index b628ed45b..2ad91fc4f 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -46,8 +46,9 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; public class plasmaSearchRankingProfile { @@ -164,7 +165,7 @@ public class plasmaSearchRankingProfile { return new String(ext); } - public long preRanking(indexEntry normalizedEntry, String searchedWord) { + public long preRanking(indexRWIEntry normalizedEntry, String searchedWord) { // the normalizedEntry must be a normalized indexEntry long ranking = 0; ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue(); @@ -191,13 +192,13 @@ public class plasmaSearchRankingProfile { Set topwords, String[] urlcomps, String[] descrcomps, - plasmaCrawlLURLEntry page) { + indexURLEntry page) { // apply pre-calculated order attributes long ranking = preranking; // prefer hit with 'prefer' pattern - plasmaCrawlLURLEntry.Components comp = page.comp(); + indexURLEntry.Components comp = page.comp(); if (comp.url().toNormalform().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); if (comp.descr().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index 0878c2350..8e5265fe0 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -54,6 +54,7 @@ import java.util.TreeMap; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.net.URL; import de.anomic.server.serverCodings; @@ -99,16 +100,16 @@ public final class plasmaSearchResult { return pageAcc.size() > 0; } - public plasmaCrawlLURLEntry nextElement() { + public indexURLEntry nextElement() { Object top = pageAcc.firstKey(); //System.out.println("postorder-key: " + ((String) top)); - return (plasmaCrawlLURLEntry) pageAcc.remove(top); + return (indexURLEntry) pageAcc.remove(top); } - protected void addResult(plasmaCrawlLURLEntry page, Long preranking) { + protected void addResult(indexURLEntry page, Long preranking) { // take out relevant information for reference computation - plasmaCrawlLURLEntry.Components comp = page.comp(); + indexURLEntry.Components comp = page.comp(); if ((comp.url() == null) || (comp.descr() == null)) return; String[] urlcomps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()); // word components of the url String[] descrcomps = comp.descr().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description @@ -131,12 +132,12 @@ public final class plasmaSearchResult { for (int i = 0; i < references.length; i++) commonSense.add(references[i]); Object[] resultVector; - plasmaCrawlLURLEntry page; + indexURLEntry page; long ranking; for (int i = 0; i < results.size(); i++) { // take out values from result array resultVector = (Object[]) results.get(i); - page = (plasmaCrawlLURLEntry) resultVector[0]; + page = (indexURLEntry) resultVector[0]; // calculate ranking if (postsort) @@ -172,7 +173,7 @@ public final class plasmaSearchResult { // first scan all entries and find all urls that are referenced while (i.hasNext()) { entry = (Map.Entry) i.next(); - path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url()); + path = urlPath(((indexURLEntry) entry.getValue()).comp().url()); paths.put(path, entry.getKey()); //if (path != null) path = shortenPath(path); //if (path != null) paths.put(path, entry.getKey()); @@ -183,7 +184,7 @@ public final class plasmaSearchResult { String shorten; while (i.hasNext()) { entry = (Map.Entry) i.next(); - path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url()); + path = urlPath(((indexURLEntry) entry.getValue()).comp().url()); shorten = shortenPath(path); // scan all subpaths of the url while (shorten != null) { diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 503570692..76e798362 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -58,6 +58,7 @@ import de.anomic.http.httpHeader; import de.anomic.http.httpc; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.net.URL; import de.anomic.plasma.cache.IResourceInfo; @@ -630,12 +631,12 @@ public class plasmaSnippetCache { public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) { // fetch snippets int i = 0; - plasmaCrawlLURLEntry urlentry; + indexURLEntry urlentry; String urlstring; long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) { urlentry = acc.nextElement(); - plasmaCrawlLURLEntry.Components comp = urlentry.comp(); + indexURLEntry.Components comp = urlentry.comp(); if (comp.url().getHost().endsWith(".yacyh")) continue; urlstring = comp.url().toNormalform(); if ((urlstring.matches(urlmask)) && diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 457fff15c..572120f86 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -132,9 +132,10 @@ import de.anomic.http.httpHeader; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpc; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; +import de.anomic.index.indexRWIEntryOld; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; @@ -1429,14 +1430,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException, ParserException { - plasmaParserDocument document = null; - + // the mimetype of this entry String mimeType = entry.getMimeType(); String charset = entry.getCharacterEncoding(); // the parser logger - serverLog parserLogger = parser.getLogger(); + //serverLog parserLogger = parser.getLogger(); // parse the document return parseResource(entry.url(), mimeType, charset, entry.cacheFile()); @@ -1497,7 +1497,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (document == null) return; } catch (ParserException e) { this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage()); - addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexURL.urlFlagLength)); + addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexRWIEntryOld.urlFlagLength)); if (document != null) { document.close(); document = null; @@ -1574,7 +1574,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); // create a new loaded URL db entry - plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry( + indexURLEntry newEntry = urlPool.loadedURL.newEntry( entry.url(), // URL docDescription, // document description "", // author @@ -1660,7 +1660,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String language = indexEntryAttribute.language(entry.url()); char doctype = indexEntryAttribute.docType(document.getMimeType()); - plasmaCrawlLURLEntry.Components comp = newEntry.comp(); + indexURLEntry.Components comp = newEntry.comp(); int urlLength = comp.url().toNormalform().length(); int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length; @@ -1673,7 +1673,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String word = (String) wentry.getKey(); wordStat = (plasmaCondenser.wordStatProp) wentry.getValue(); String wordHash = indexEntryAttribute.word2hash(word); - indexEntry wordIdxEntry = new indexURLEntry( + indexRWIEntry wordIdxEntry = new indexRWIEntryOld( urlHash, urlLength, urlComps, wordStat.count, @@ -1764,7 +1764,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } else { log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase); - addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield(indexURL.urlFlagLength)); + addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield(indexRWIEntryOld.urlFlagLength)); } } catch (Exception ee) { if (ee instanceof InterruptedException) throw (InterruptedException)ee; @@ -1776,7 +1776,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, ""); } - addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield(indexURL.urlFlagLength)); + addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield(indexRWIEntryOld.urlFlagLength)); } } else { @@ -1784,7 +1784,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason); - addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield(indexURL.urlFlagLength)); + addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield(indexRWIEntryOld.urlFlagLength)); if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, ""); } @@ -1991,7 +1991,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String lurl = (String) page.get("lurl"); if ((lurl != null) && (lurl.length() != 0)) { String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); - plasmaCrawlLURLEntry entry = urlPool.loadedURL.newEntry(propStr); + indexURLEntry entry = urlPool.loadedURL.newEntry(propStr); urlPool.loadedURL.store(entry); urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? urlPool.noticeURL.remove(entry.hash()); @@ -2070,7 +2070,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser prop.put("type_globalresults", acc.globalContributions); int i = 0; int p; - plasmaCrawlLURLEntry urlentry; + indexURLEntry urlentry; String urlstring, urlname, filename, urlhash; String host, hash, address; yacySeed seed; @@ -2081,7 +2081,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000; while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) { urlentry = acc.nextElement(); - plasmaCrawlLURLEntry.Components comp = urlentry.comp(); + indexURLEntry.Components comp = urlentry.comp(); urlhash = urlentry.hash(); assert (urlhash != null); assert (urlhash.length() == 12) : "urlhash = " + urlhash; @@ -2218,9 +2218,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // finally, delete the url entry // determine the url string - plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null); + indexURLEntry entry = urlPool.loadedURL.load(urlhash, null); if (entry == null) return 0; - plasmaCrawlLURLEntry.Components comp = entry.comp(); + indexURLEntry.Components comp = entry.comp(); if (comp.url() == null) return 0; InputStream resourceContent = null; diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index 584d1ff53..b7d4893e1 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -51,6 +51,8 @@ import java.util.ArrayList; import java.util.Date; import de.anomic.index.indexURL; +import de.anomic.index.indexRWIEntryOld; +import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroRow; @@ -79,14 +81,14 @@ public class plasmaSwitchboardQueue { private void initQueueStack() { kelondroRow rowdef = new kelondroRow( - "String url-" + indexURL.urlStringLength + ", " + // the url - "String refhash-" + indexURL.urlHashLength + ", " + // the url's referrer hash - "Cardinal modifiedsince-11" + " {b64e}, " + // from ifModifiedSince - "byte[] flags-1" + ", " + // flags - "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator - "Cardinal depth-" + indexURL.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0 - "String profile-" + indexURL.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle - "String urldescr-" + indexURL.urlDescrLength); // + "String url-" + yacySeedDB.commonHashLength + ", " + // the url + "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash + "Cardinal modifiedsince-11" + " {b64e}, " + // from ifModifiedSince + "byte[] flags-1" + ", " + // flags + "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator + "Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0 + "String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle + "String urldescr-" + indexRWIEntryOld.urlDescrLength); // sbQueueStack = kelondroStack.open(sbQueueStackPath, rowdef); } @@ -108,7 +110,7 @@ public class plasmaSwitchboardQueue { kelondroBase64Order.enhancedCoder.encodeLong((entry.ifModifiedSince == null) ? 0 : entry.ifModifiedSince.getTime(), 11).getBytes(), new byte[]{entry.flags}, (entry.initiator == null) ? indexURL.dummyHash.getBytes() : entry.initiator.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, indexURL.urlCrawlDepthLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(), (entry.profileHandle == null) ? indexURL.dummyHash.getBytes() : entry.profileHandle.getBytes(), (entry.anchorName == null) ? "-".getBytes("UTF-8") : entry.anchorName.getBytes("UTF-8") })); @@ -333,7 +335,7 @@ public class plasmaSwitchboardQueue { public URL referrerURL() { if (referrerURL == null) { if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null; - plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null); + indexURLEntry entry = lurls.load(referrerHash, null); if (entry == null) referrerURL = null; else referrerURL = entry.comp().url(); } return referrerURL; diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java index f861d748a..08ecb0e65 100644 --- a/source/de/anomic/plasma/plasmaURLPool.java +++ b/source/de/anomic/plasma/plasmaURLPool.java @@ -48,6 +48,7 @@ import java.io.File; import java.io.IOException; import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; import de.anomic.net.URL; public class plasmaURLPool { @@ -83,7 +84,7 @@ public class plasmaURLPool { plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); if (ne != null) return ne.url(); } catch (IOException e) {} - plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null); + indexURLEntry le = loadedURL.load(urlhash, null); if (le != null) return le.comp().url(); plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); if (ee != null) return ee.url(); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 8ab5079c4..805dc49ea 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -40,10 +40,11 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.index.indexCollectionRI; import de.anomic.index.indexContainer; import de.anomic.index.indexContainerOrder; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexRAMRI; import de.anomic.index.indexRI; +import de.anomic.index.indexRWIEntryOld; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; @@ -60,7 +61,7 @@ public final class plasmaWordIndex implements indexRI { private static final String indexAssortmentClusterPath = "ACLUSTER"; private static final int assortmentCount = 64; - private static final kelondroRow payloadrow = indexURLEntry.urlEntryRow; + private static final kelondroRow payloadrow = indexRWIEntryOld.urlEntryRow; private final File oldDatabaseRoot; private final kelondroOrder indexOrder = new kelondroNaturalOrder(true); @@ -201,7 +202,7 @@ public final class plasmaWordIndex implements indexRI { return new indexContainer(wordHash, payloadrow); } - public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean dhtInCase) { + public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) { // set dhtInCase depending on wordHash if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(wordHash))) dhtInCase = true; @@ -318,7 +319,7 @@ public final class plasmaWordIndex implements indexRI { Iterator i = condenser.words(); Map.Entry wentry; String word; - indexEntry ientry; + indexRWIEntry ientry; plasmaCondenser.wordStatProp wprop; String wordHash; int urlLength = url.toString().length(); @@ -330,7 +331,7 @@ public final class plasmaWordIndex implements indexRI { wprop = (plasmaCondenser.wordStatProp) wentry.getValue(); // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); wordHash = indexEntryAttribute.word2hash(word); - ientry = new indexURLEntry(urlHash, + ientry = new indexRWIEntryOld(urlHash, urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(), wprop.count, condenser.RESULT_SIMI_WORDS, @@ -685,11 +686,11 @@ public final class plasmaWordIndex implements indexRI { // the combined container will fit, read the container try { Iterator entries = entity.elements(true); - indexEntry entry; + indexRWIEntry entry; while (entries.hasNext()) { - entry = (indexEntry) entries.next(); + entry = (indexRWIEntry) entries.next(); // System.out.println("ENTRY = " + entry.getUrlHash()); - container.add(new indexEntry[]{entry}, System.currentTimeMillis()); + container.add(new indexRWIEntry[]{entry}, System.currentTimeMillis()); } // we have read all elements, now delete the entity entity.deleteComplete(); @@ -723,11 +724,11 @@ public final class plasmaWordIndex implements indexRI { try { Iterator entries = entity.elements(true); - indexEntry entry; + indexRWIEntry entry; while (entries.hasNext()) { - entry = (indexEntry) entries.next(); + entry = (indexRWIEntry) entries.next(); // System.out.println("ENTRY = " + entry.getUrlHash()); - container.add(new indexEntry[] { entry }, System.currentTimeMillis()); + container.add(new indexRWIEntry[] { entry }, System.currentTimeMillis()); } // we have read all elements, now delete the entity entity.deleteComplete(); @@ -775,7 +776,7 @@ public final class plasmaWordIndex implements indexRI { public void run() { serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started"); indexContainer container = null; - indexEntry entry = null; + indexRWIEntry entry = null; URL url = null; HashSet urlHashs = new HashSet(); try { @@ -787,9 +788,9 @@ public final class plasmaWordIndex implements indexRI { wordHashNow = container.getWordHash(); while (containerIterator.hasNext() && run) { waiter(); - entry = (indexEntry) containerIterator.next(); + entry = (indexRWIEntry) containerIterator.next(); // System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash()); - plasmaCrawlLURLEntry ue = lurl.load(entry.urlHash(), null); + indexURLEntry ue = lurl.load(entry.urlHash(), null); if (ue == null) { urlHashs.add(entry.urlHash()); } else { diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index 10215e064..85356539a 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -57,15 +57,15 @@ import java.io.IOException; import java.util.Iterator; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; -import de.anomic.index.indexEntryAttribute; -import de.anomic.index.indexURLEntry; +import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexRWIEntryOld; import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroColumn; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroTree; import de.anomic.server.logging.serverLog; +import de.anomic.yacy.yacySeedDB; public final class plasmaWordIndexAssortment { @@ -89,7 +89,7 @@ public final class plasmaWordIndexAssortment { private kelondroRow bufferStructure(int assortmentCapacity) { kelondroColumn[] structure = new kelondroColumn[3 + assortmentCapacity]; - structure[0] = new kelondroColumn("byte[] wordhash-" + indexEntryAttribute.wordHashLength); + structure[0] = new kelondroColumn("byte[] wordhash-" + yacySeedDB.commonHashLength); structure[1] = new kelondroColumn("Cardinal occ-4 {b256}"); structure[2] = new kelondroColumn("Cardinal time-8 {b256}"); kelondroColumn p = new kelondroColumn("byte[] urlprops-" + payloadrow.objectsize()); @@ -98,7 +98,7 @@ public final class plasmaWordIndexAssortment { } private int assortmentCapacity(int rowsize) { - return (rowsize - indexEntryAttribute.wordHashLength - 12) / payloadrow.objectsize(); + return (rowsize - yacySeedDB.commonHashLength - 12) / payloadrow.objectsize(); } public plasmaWordIndexAssortment(File storagePath, kelondroRow payloadrow, int assortmentLength, int bufferkb, long preloadTime, serverLog log) throws IOException { @@ -133,9 +133,9 @@ public final class plasmaWordIndexAssortment { row.setCol(1, 1); row.setCol(2, newContainer.updated()); Iterator entries = newContainer.entries(); - indexEntry entry; + indexRWIEntry entry; for (int i = 0; i < assortmentLength; i++) { - entry = (indexEntry) entries.next(); + entry = (indexRWIEntry) entries.next(); row.setCol(3 + i, entry.toKelondroEntry().bytes()); } kelondroRow.Entry oldrow = null; @@ -221,7 +221,7 @@ public final class plasmaWordIndexAssortment { indexContainer container = new indexContainer(wordHash, payloadrow); int al = assortmentCapacity(row.objectsize()); for (int i = 0; i < al; i++) { - container.add(new indexEntry[] { new indexURLEntry(row.getColBytes(3 + i)) }, updateTime); + container.add(new indexRWIEntry[] { new indexRWIEntryOld(row.getColBytes(3 + i)) }, updateTime); } return container; } diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java index c26ab24e9..983f1307e 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java @@ -54,7 +54,7 @@ import java.util.Set; import de.anomic.index.indexContainer; import de.anomic.index.indexContainerOrder; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; import de.anomic.index.indexRI; import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroMergeIterator; @@ -168,7 +168,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI { c = new indexContainer(newContainer.getWordHash(), payloadrow); for (int k = 0; k < j; k++) { if (i.hasNext()) { - c.add((indexEntry) i.next(), newContainer.updated()); + c.add((indexRWIEntry) i.next(), newContainer.updated()); } else { storeForced(c); return; @@ -178,7 +178,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI { } } - public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { + public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { indexContainer container = new indexContainer(wordHash, payloadrow); container.add(newEntry); return addEntries(container, updateTime, dhtCase); @@ -223,7 +223,7 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI { c = new indexContainer(newContainer.getWordHash(), payloadrow); for (int k = 0; k <= j; k++) { assert (i.hasNext()); - c.add((indexEntry) i.next(), newContainer.updated()); + c.add((indexRWIEntry) i.next(), newContainer.updated()); } try { storeForced(c); @@ -306,9 +306,9 @@ public final class plasmaWordIndexAssortmentCluster implements indexRI { if (buffer != null) { // sort out url hashes that shall be deleted Iterator bi = buffer.entries(); - indexEntry entry; + indexRWIEntry entry; while (bi.hasNext()) { - entry = (indexEntry) bi.next(); + entry = (indexRWIEntry) bi.next(); if (urlHashes.remove(entry.urlHash())) bi.remove(); } record.add(buffer, -1); diff --git a/source/de/anomic/plasma/plasmaWordIndexFile.java b/source/de/anomic/plasma/plasmaWordIndexFile.java index 4eb5785c0..68ed2691f 100644 --- a/source/de/anomic/plasma/plasmaWordIndexFile.java +++ b/source/de/anomic/plasma/plasmaWordIndexFile.java @@ -49,13 +49,13 @@ import java.io.IOException; import java.util.Iterator; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; -import de.anomic.index.indexURL; -import de.anomic.index.indexURLEntry; +import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexRWIEntryOld; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroTree; import de.anomic.server.logging.serverLog; +import de.anomic.yacy.yacySeedDB; public final class plasmaWordIndexFile { @@ -91,7 +91,7 @@ public final class plasmaWordIndexFile { long cacheSize = theLocation.length(); if (cacheSize > 1048576) cacheSize = 1048576; return kelondroTree.open(theLocation, cacheSize, 0, - new kelondroRow("byte[] urlhash-" + indexURL.urlHashLength + ", byte[] ba-" + (indexURLEntry.urlEntryRow.objectsize() - indexURL.urlHashLength))); + new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength + ", byte[] ba-" + (indexRWIEntryOld.urlEntryRow.objectsize() - yacySeedDB.commonHashLength))); } public static File wordHash2path(File databaseRoot, String hash) { @@ -128,23 +128,23 @@ public final class plasmaWordIndexFile { } catch (IOException e) {} } - public indexEntry getEntry(String urlhash) throws IOException { + public indexRWIEntry getEntry(String urlhash) throws IOException { kelondroRow.Entry n = theIndex.get(urlhash.getBytes()); if (n == null) return null; - return new indexURLEntry(n.getColString(0, null), n.getColString(1, null)); + return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null)); } public boolean contains(String urlhash) throws IOException { return (theIndex.get(urlhash.getBytes()) != null); } - public boolean contains(indexEntry entry) throws IOException { + public boolean contains(indexRWIEntry entry) throws IOException { return (theIndex.get(entry.urlHash().getBytes()) != null); } - public boolean addEntry(indexEntry entry) throws IOException { + public boolean addEntry(indexRWIEntry entry) throws IOException { if (entry == null) return false; - indexEntry oldEntry = getEntry(entry.urlHash()); + indexRWIEntry oldEntry = getEntry(entry.urlHash()); if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity return false; } @@ -163,7 +163,7 @@ public final class plasmaWordIndexFile { if (container != null) { Iterator i = container.entries(); while (i.hasNext()) { - if (addEntry((indexEntry) i.next())) count++; + if (addEntry((indexRWIEntry) i.next())) count++; } } @@ -228,7 +228,7 @@ public final class plasmaWordIndexFile { public Object next() { if (i == null) return null; kelondroRow.Entry n = (kelondroRow.Entry) i.next(); - return new indexURLEntry(n.getColString(0, null), n.getColString(1, null)); + return new indexRWIEntryOld(n.getColString(0, null), n.getColString(1, null)); } public void remove() { throw new UnsupportedOperationException(); @@ -248,7 +248,7 @@ public final class plasmaWordIndexFile { long timeout = (time == -1) ? Long.MAX_VALUE : System.currentTimeMillis() + time; try { while ((i.hasNext()) && (System.currentTimeMillis() < timeout)) { - addEntry((indexEntry) i.next()); + addEntry((indexRWIEntry) i.next()); } } catch (kelondroException e) { serverLog.logSevere("PLASMA", "plasmaWordIndexEntity.merge: " + e.getMessage()); diff --git a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java index 961033617..720c01ac4 100644 --- a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java @@ -51,7 +51,7 @@ import java.util.Set; import java.util.TreeSet; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; import de.anomic.index.indexRI; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroRow; @@ -235,10 +235,10 @@ public class plasmaWordIndexFileCluster implements indexRI { if (exists(wordHash)) { plasmaWordIndexFile entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10); indexContainer container = new indexContainer(wordHash, payloadrow); - indexEntry entry; + indexRWIEntry entry; Iterator i = entity.elements(true); while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) { - entry = (indexEntry) i.next(); + entry = (indexRWIEntry) i.next(); if ((urlselection == null) || (urlselection.contains(entry.urlHash()))) container.add(entry); } return container; @@ -302,7 +302,7 @@ public class plasmaWordIndexFileCluster implements indexRI { } else return 0; } - public indexContainer addEntry(String wordHash, indexEntry newEntry, long updateTime, boolean dhtCase) { + public indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) { indexContainer container = new indexContainer(wordHash, payloadrow); container.add(newEntry); return addEntries(container, updateTime, dhtCase); diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 3bea9a0dd..76b117276 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -55,14 +55,14 @@ import java.util.TreeMap; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpc; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; +import de.anomic.index.indexRWIEntryOld; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlLURL; -import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSnippetCache; @@ -491,33 +491,33 @@ public final class yacyClient { //System.out.println("***result count " + results); // create containers - final int words = wordhashes.length() / indexEntryAttribute.wordHashLength; + final int words = wordhashes.length() / yacySeedDB.commonHashLength; indexContainer[] container = new indexContainer[words]; for (int i = 0; i < words; i++) { - container[i] = new indexContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength), indexURLEntry.urlEntryRow); + container[i] = new indexContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), indexRWIEntryOld.urlEntryRow); } // insert results to containers - plasmaCrawlLURLEntry urlEntry; + indexURLEntry urlEntry; String[] urls = new String[results]; for (int n = 0; n < results; n++) { // get one single search result urlEntry = urlManager.newEntry((String) result.get("resource" + n)); if (urlEntry == null) continue; assert (urlEntry.hash().length() == 12) : "urlEntry.hash() = " + urlEntry.hash(); - plasmaCrawlLURLEntry.Components comp = urlEntry.comp(); + indexURLEntry.Components comp = urlEntry.comp(); if (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, comp.url())) continue; // block with backlist urlManager.store(urlEntry); urlManager.stack(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); // save the url entry - final indexEntry entry; + final indexRWIEntry entry; if (urlEntry.word() == null) { // the old way to define words int urlLength = comp.url().toNormalform().length(); int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length; - entry = new indexURLEntry( + entry = new indexRWIEntryOld( urlEntry.hash(), urlLength, urlComps, @@ -545,7 +545,7 @@ public final class yacyClient { } // add the url entry to the word indexes for (int m = 0; m < words; m++) { - container[m].add(new indexEntry[]{entry}, System.currentTimeMillis()); + container[m].add(new indexRWIEntry[]{entry}, System.currentTimeMillis()); } // store url hash for statistics urls[n] = urlEntry.hash(); @@ -869,7 +869,7 @@ public final class yacyClient { -er crawlt, Ergebnis erscheint aber unter falschem initiator */ - public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURLEntry entry, String wordhashes) { + public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, indexURLEntry entry, String wordhashes) { if (targetSeed == null) { return null; } if (yacyCore.seedDB.mySeed == null) { return null; } if (yacyCore.seedDB.mySeed == targetSeed) { return null; } @@ -943,11 +943,11 @@ public final class yacyClient { // check if we got all necessary urls in the urlCache (only for debugging) Iterator eenum; - indexEntry entry; + indexRWIEntry entry; for (int i = 0; i < indexes.length; i++) { eenum = indexes[i].entries(); while (eenum.hasNext()) { - entry = (indexEntry) eenum.next(); + entry = (indexRWIEntry) eenum.next(); if (urlCache.get(entry.urlHash()) == null) { yacyCore.log.logFine("DEBUG transferIndex: to-send url hash '" + entry.urlHash() + "' is not contained in urlCache"); } @@ -988,9 +988,9 @@ public final class yacyClient { if (uhs.length == 0) { return resultObj; } // all url's known // extract the urlCache from the result - plasmaCrawlLURLEntry[] urls = new plasmaCrawlLURLEntry[uhs.length]; + indexURLEntry[] urls = new indexURLEntry[uhs.length]; for (int i = 0; i < uhs.length; i++) { - urls[i] = (plasmaCrawlLURLEntry) urlCache.get(uhs[i]); + urls[i] = (indexURLEntry) urlCache.get(uhs[i]); if (urls[i] == null) { yacyCore.log.logFine("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'"); } @@ -1051,11 +1051,11 @@ public final class yacyClient { int indexcount = 0; final StringBuffer entrypost = new StringBuffer(indexes.length*73); Iterator eenum; - indexEntry entry; + indexRWIEntry entry; for (int i = 0; i < indexes.length; i++) { eenum = indexes[i].entries(); while (eenum.hasNext()) { - entry = (indexEntry) eenum.next(); + entry = (indexRWIEntry) eenum.next(); entrypost.append(indexes[i].getWordHash()) .append(entry.toPropertyForm(false)) .append(serverCore.crlfString); @@ -1099,7 +1099,7 @@ public final class yacyClient { } } - private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURLEntry[] urls, boolean gzipBody, int timeout) { + private static HashMap transferURL(yacySeed targetSeed, indexURLEntry[] urls, boolean gzipBody, int timeout) { // this post a message to the remote message board final String address = targetSeed.getAddress(); if (address == null) { return null; } diff --git a/source/yacy.java b/source/yacy.java index b648e9e2d..08dcfd6d2 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -71,10 +71,11 @@ import de.anomic.http.httpd; import de.anomic.http.httpdFileHandler; import de.anomic.http.httpdProxyHandler; import de.anomic.index.indexContainer; -import de.anomic.index.indexEntry; +import de.anomic.index.indexRWIEntry; import de.anomic.index.indexEntryAttribute; -import de.anomic.index.indexURL; +import de.anomic.index.indexRWIEntryOld; import de.anomic.index.indexURLEntry; +import de.anomic.index.indexURLEntryOld; import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMap; @@ -83,8 +84,6 @@ import de.anomic.kelondro.kelondroTree; import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlLURL; -import de.anomic.plasma.plasmaCrawlLURLEntry; -import de.anomic.plasma.plasmaCrawlLURLOldEntry; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURLPool; @@ -623,7 +622,7 @@ public final class yacy { kelondroMScoreCluster hs = new kelondroMScoreCluster(); while (ef.hasMoreElements()) { f = (File) ef.nextElement(); - h = f.getName().substring(0, indexURL.urlHashLength); + h = f.getName().substring(0, yacySeedDB.commonHashLength); hs.addScore(h, (int) f.length()); } @@ -740,12 +739,12 @@ public final class yacy { // the combined container will fit, read the container Iterator wordIdxEntries = wordIdxContainer.entries(); - indexEntry iEntry; + indexRWIEntry iEntry; while (wordIdxEntries.hasNext()) { - iEntry = (indexEntry) wordIdxEntries.next(); + iEntry = (indexRWIEntry) wordIdxEntries.next(); String urlHash = iEntry.urlHash(); if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { - plasmaCrawlLURLEntry urlEntry = currentUrlDB.load(urlHash, null); + indexURLEntry urlEntry = currentUrlDB.load(urlHash, null); urlCounter++; minimizedUrlDB.store(urlEntry); if (urlCounter % 500 == 0) { @@ -965,11 +964,11 @@ public final class yacy { long start = System.currentTimeMillis(); if (source.equals("lurl")) { Iterator eiter = pool.loadedURL.entries(true, false, null); - plasmaCrawlLURLEntry entry; + indexURLEntry entry; while (eiter.hasNext()) { try { - entry = (plasmaCrawlLURLEntry) eiter.next(); - plasmaCrawlLURLEntry.Components comp = entry.comp(); + entry = (indexURLEntry) eiter.next(); + indexURLEntry.Components comp = entry.comp(); if ((entry != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null); } catch (Exception e) { // here a MalformedURLException may occur @@ -1077,10 +1076,10 @@ public final class yacy { if (source.equals("lurl")) { Iterator eiter = pool.loadedURL.entries(true, false, null); - plasmaCrawlLURLEntry entry; + indexURLEntry entry; while (eiter.hasNext()) { - entry = (plasmaCrawlLURLEntry) eiter.next(); - plasmaCrawlLURLEntry.Components comp = entry.comp(); + entry = (indexURLEntry) eiter.next(); + indexURLEntry.Components comp = entry.comp(); if ((entry != null) && (comp.url() != null)) { if (html) { bos.write(("" + comp.descr() + "
").getBytes("UTF-8")); @@ -1135,7 +1134,7 @@ public final class yacy { plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, true, 1000, true, 1000, true, 10000); kelondroTree oldindex = null; try { - oldindex = new kelondroTree(urlHash, 1000, -1, plasmaCrawlLURLOldEntry.rowdef); + oldindex = new kelondroTree(urlHash, 1000, -1, indexURLEntryOld.rowdef); } catch (IOException e) { System.out.println("ERROR: CANNOT OPEN OLD INDEX: " + e.getMessage()); } @@ -1145,9 +1144,9 @@ public final class yacy { int tc = oldindex.size(), c = 0; Iterator eiter = oldindex.contentRows(-1); kelondroRow.Entry oldrow; - plasmaCrawlLURLEntry oldentry; - plasmaCrawlLURLEntry newentry; - plasmaCrawlLURLEntry.Components comp; + indexURLEntry oldentry; + indexURLEntry newentry; + indexURLEntry.Components comp; byte[] dummymd5 = new byte[0]; while (eiter.hasNext()) { try { @@ -1158,7 +1157,7 @@ public final class yacy { oldrow = null; } if (oldrow != null) try { - oldentry = new plasmaCrawlLURLOldEntry(oldrow, null); + oldentry = new indexURLEntryOld(oldrow, null); comp = oldentry.comp(); newentry = pool.loadedURL.newEntry( comp.url(), @@ -1236,7 +1235,7 @@ public final class yacy { WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, true, 8*1024*1024, 3000, log, sps.getConfigBool("useCollectionIndex", false)); indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false); } else if (resource.equals("assortments")) { - plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, indexURLEntry.urlEntryRow, 16*1024*1024, 3000, log); + plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, indexRWIEntryOld.urlEntryRow, 16*1024*1024, 3000, log); indexContainerIterator = assortmentCluster.wordContainers(wordChunkStartHash, true, false); } /*else if (resource.startsWith("assortment")) { int a = Integer.parseInt(resource.substring(10));