diff --git a/build.properties b/build.properties index ff751da1f..67e12825d 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.5 javacTarget=1.5 # Release Configuration -releaseVersion=0.91 +releaseVersion=0.92 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseFileParentDir=yacy diff --git a/defaults/yacy.init b/defaults/yacy.init index 88acfd11d..66dc80463 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -914,7 +914,18 @@ content.phpbb3.tableprefix = phpbb_ content.phpbb3.dbuser = notroot content.phpbb3.dbpw = joshua content.phpbb3.ppf = 1000 -content.phpbb3.dumpfile = +content.phpbb3.dumpfile = + +# segment assignment for index storage processes in YaCy: +# each process can store its index result in its own index segment +segment.process.receipts_tmp = default +segment.process.queries_tmp = default +segment.process.dhtin_tmp = default +segment.process.dhtout_tmp = default +segment.process.proxy_tmp = default +segment.process.localcrawling_tmp = default +segment.process.remotecrawling_tmp = default +segment.process.default_tmp = default # search engine teaser: an about box in search results # this is only shown, if the about.body is filled diff --git a/htroot/BlacklistCleaner_p.java b/htroot/BlacklistCleaner_p.java index c5e4fb8bb..b68c6860a 100644 --- a/htroot/BlacklistCleaner_p.java +++ b/htroot/BlacklistCleaner_p.java @@ -75,7 +75,7 @@ public class BlacklistCleaner_p { listManager.listsPath = new File(env.getRootPath(), env.getConfig("listManager.listsPath", "DATA/LISTS")); String blacklistToUse = null; - // getting the list of supported blacklist types + // get the list of supported blacklist types final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING; final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); @@ -290,7 +290,7 @@ public class BlacklistCleaner_p { if (list != null){ - // getting rid of escape characters which make it impossible to + // get rid of escape characters which make it impossible to // properly use contains() if (s.contains("\\\\")) { s = s.replaceAll(Pattern.quote("\\\\"), Matcher.quoteReplacement("\\")); diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java index 179e6b42d..b0ed459bd 100644 --- a/htroot/Blacklist_p.java +++ b/htroot/Blacklist_p.java @@ -64,11 +64,11 @@ public class Blacklist_p { listManager.switchboard = (Switchboard) env; listManager.listsPath = new File(listManager.switchboard.getRootPath(),listManager.switchboard.getConfig("listManager.listsPath", "DATA/LISTS")); - // getting the list of supported blacklist types + // get the list of supported blacklist types final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING; final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); - // loading all blacklist files located in the directory + // load all blacklist files located in the directory List dirlist = listManager.getDirListing(listManager.listsPath, BLACKLIST_FILENAME_FILTER); String blacklistToUse = null; diff --git a/htroot/BlogComments.java b/htroot/BlogComments.java index 2940b177a..bb423ce43 100644 --- a/htroot/BlogComments.java +++ b/htroot/BlogComments.java @@ -328,14 +328,14 @@ public class BlogComments { try { if (!Boolean.valueOf(sb.getConfig("msgForwardingEnabled","false")).booleanValue()) return; - // getting the recipient address + // get the recipient address final String
sendMailTo = sb.getConfig("msgForwardingTo","root@localhost").trim(); - // getting the sendmail configuration + // get the sendmail configuration final String sendMailStr = sb.getConfig("msgForwardingCmd","/usr/bin/sendmail")+" "+sendMailTo; final String[] sendMail = sendMailStr.trim().split(" "); - // building the message text + // build the message text final StringBuilder emailTxt = new StringBuilder(); emailTxt.append("To: ") .append(sendMailTo) diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index b6b93f6e4..f40248552 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -43,6 +43,7 @@ import de.anomic.data.userDB; import de.anomic.data.bookmarksDB.Tag; import de.anomic.document.Document; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.search.Switchboard; @@ -183,7 +184,7 @@ public class Bookmarks { final bookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash); if (bookmark == null) { // try to get the bookmark from the LURL database - final URLMetadataRow urlentry = sb.indexSegment.urlMetadata().load(urlHash, null, 0); + final URLMetadataRow urlentry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(urlHash, null, 0); Document document = null; if (urlentry != null) { final URLMetadataRow.Components metadata = urlentry.metadata(); diff --git a/htroot/Connections_p.java b/htroot/Connections_p.java index 8cd043d48..7e89b501c 100644 --- a/htroot/Connections_p.java +++ b/htroot/Connections_p.java @@ -88,19 +88,19 @@ public final class Connections_p { for ( int currentThreadIdx = 0; currentThreadIdx < count; currentThreadIdx++ ) { final Thread t = threadList[currentThreadIdx]; if ((t != null) && (t instanceof serverCore.Session) && (t.isAlive())) { - // getting the session object + // get the session object final Session s = ((Session) t); - // getting the session runtime + // get the session runtime final long sessionTime = s.getTime(); - // getting the request command line + // get the request command line boolean blockingRequest = false; String commandLine = s.getCommandLine(); if (commandLine == null) blockingRequest = true; final int commandCount = s.getCommandCount(); - // getting the source ip address and port + // get the source ip address and port final InetAddress userAddress = s.getUserAddress(); final int userPort = s.getUserPort(); if (userAddress == null) continue; @@ -113,13 +113,13 @@ public final class Connections_p { if (cmdObj instanceof HTTPDemon) { prot = isSSL ?
"https":"http"; - // getting the http command object + // get the http command object final HTTPDemon currentHttpd = (HTTPDemon)cmdObj; - // getting the connection properties of this session + // get the connection properties of this session final Properties conProp = (Properties) currentHttpd.getConProp().clone(); - // getting the destination host + // get the destination host dest = conProp.getProperty(HeaderFramework.CONNECTION_PROP_HOST); if (dest==null)continue; } diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index e0fdb34ab..cff8614f8 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -31,6 +31,7 @@ import java.util.Iterator; import java.util.Locale; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; @@ -104,7 +105,7 @@ public class CrawlResults { final String hash = post.get("hash", null); if (hash != null) { // delete from database - sb.indexSegment.urlMetadata().remove(hash); + sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(hash); } } @@ -114,7 +115,7 @@ public class CrawlResults { if (hashpart != null) { // delete all urls for this domain from database try { - sb.indexSegment.urlMetadata().deleteDomain(hashpart); + sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).deleteDomain(hashpart); sb.crawlResults.deleteDomain(tabletype, domain, hashpart); } catch (IOException e) { e.printStackTrace(); @@ -178,7 +179,7 @@ public class CrawlResults { executorHash = sb.crawlResults.getExecutorHash(tabletype, i); urlHash = sb.crawlResults.getUrlHash(tabletype, i); try { - urle = sb.indexSegment.urlMetadata().load(urlHash, null, 0); + urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(urlHash, null, 0); if(urle == null) { Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ urlHash); urlstr = null; diff --git a/htroot/IndexCleaner_p.html b/htroot/IndexCleaner_p.html index 727a81e74..86eb96114 100755 --- a/htroot/IndexCleaner_p.html +++ b/htroot/IndexCleaner_p.html @@ -40,6 +40,11 @@ #(/rwidb)#

URL-DB-Cleaner - Clean up the database by deleting blacklisted URLs:
+ Start/Resume Stop Pause diff --git a/htroot/IndexCleaner_p.java b/htroot/IndexCleaner_p.java index 1055091a8..80dcbd464 100755 --- a/htroot/IndexCleaner_p.java +++ b/htroot/IndexCleaner_p.java @@ -27,6 +27,7 @@ import de.anomic.http.metadata.RequestHeader; import de.anomic.kelondro.text.MetadataRepository; import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.Segments; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -39,11 +40,23 @@ public class IndexCleaner_p { final serverObjects prop = new serverObjects(); final Switchboard sb = (Switchboard) env; prop.put("title", "DbCleanup_p"); + + // get segment + Segment indexSegment = null; + if (post != null && post.containsKey("segment")) { + String segmentName = post.get("segment"); + if (sb.indexSegments.segmentExist(segmentName)) { + indexSegment = sb.indexSegments.segment(segmentName); + } + } else { + // take default segment + indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC); + } + if (post!=null) { - //prop.putHTML("bla", "post!=null"); if (post.get("action").equals("ustart")) { if (urldbCleanerThread==null || !urldbCleanerThread.isAlive()) { - urldbCleanerThread = sb.indexSegment.urlMetadata().getBlacklistCleaner(Switchboard.urlBlacklist); + urldbCleanerThread = indexSegment.urlMetadata().getBlacklistCleaner(Switchboard.urlBlacklist); urldbCleanerThread.start(); } else { @@ -58,7 +71,7 @@ public class IndexCleaner_p { } else if (post.get("action").equals("rstart")) { if (indexCleanerThread==null || !indexCleanerThread.isAlive()) { - indexCleanerThread = sb.indexSegment.getReferenceCleaner(post.get("wordHash","AAAAAAAAAAAA").getBytes()); + indexCleanerThread = indexSegment.getReferenceCleaner(post.get("wordHash","AAAAAAAAAAAA").getBytes()); indexCleanerThread.start(); } else { @@ -74,10 +87,9 @@ public class IndexCleaner_p { prop.put("LOCATION",""); return prop; } - //prop.put("bla", "post==null"); if (urldbCleanerThread!=null) { prop.put("urldb", "1"); - prop.putNum("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/sb.indexSegment.urlMetadata().size())*100); + prop.putNum("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/indexSegment.urlMetadata().size())*100); prop.putNum("urldb_blacklisted", urldbCleanerThread.blacklistedUrls); prop.putNum("urldb_total", urldbCleanerThread.totalSearchedUrls); prop.putHTML("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl); @@ -94,7 +106,7 @@ public class IndexCleaner_p { prop.put("rwidb_threadAlive", indexCleanerThread.isAlive() + ""); prop.put("rwidb_threadToString", indexCleanerThread.toString()); prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart); - prop.putNum("rwidb_RWIcountnow", sb.indexSegment.termIndex().sizesMax()); + prop.putNum("rwidb_RWIcountnow", indexCleanerThread.rwisize()); prop.put("rwidb_wordHashNow", (indexCleanerThread.wordHashNow == null) ? "NULL" : new String(indexCleanerThread.wordHashNow)); prop.put("rwidb_lastWordHash", (indexCleanerThread.lastWordHash == null) ? "null" : new String(indexCleanerThread.lastWordHash)); prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter); diff --git a/htroot/IndexControlRWIs_p.html b/htroot/IndexControlRWIs_p.html index 66159314b..b1ac84907 100644 --- a/htroot/IndexControlRWIs_p.html +++ b/htroot/IndexControlRWIs_p.html @@ -12,6 +12,15 @@

RWI Retrieval (= search for a single word)
+
Select Segment:
+
+ +
+
Retrieve by Word:
diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index b2ef13c7e..ccf69282e 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -55,6 +55,7 @@ import de.anomic.search.QueryParams; import de.anomic.search.RankingProcess; import de.anomic.search.SearchEventCache; import de.anomic.search.Switchboard; +import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyClient; @@ -69,10 +70,20 @@ public class IndexControlRWIs_p { final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); + // set default values prop.putHTML("keystring", ""); prop.put("keyhash", ""); prop.put("result", ""); - + String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); + int i = 0; + for (String s: sb.indexSegments.segmentNames()) { + prop.put("segments_" + i + "_name", s); + prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0); + i++; + } + Segment segment = sb.indexSegments.segment(segmentName); + prop.put("segments", i); + // switch off all optional forms/lists prop.put("searchresult", 0); prop.put("keyhashsimilar", 0); @@ -83,6 +94,16 @@ public class IndexControlRWIs_p { if (post != null) { // default values + segmentName = post.get("segment", segmentName).trim(); + i= 0; + for (String s: sb.indexSegments.segmentNames()) { + prop.put("segments_" + i + "_name", s); + prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0); + i++; + } + prop.put("segments", i); + segment = sb.indexSegments.segment(segmentName); + final String keystring = post.get("keystring", "").trim(); byte[] keyhash = post.get("keyhash", "").trim().getBytes(); prop.putHTML("keystring", keystring); @@ -96,7 +117,7 @@ public class IndexControlRWIs_p { if (post.containsKey("keystringsearch")) { keyhash = Word.word2hash(keystring); prop.put("keyhash", keyhash); - final RankingProcess ranking = genSearchresult(prop, sb, keyhash, null); + final RankingProcess ranking = genSearchresult(prop, sb, segment, keyhash, null); if (ranking.filteredCount() == 0) { prop.put("searchresult", 1); prop.putHTML("searchresult_word", keystring); @@ -107,7 +128,7 @@ public class IndexControlRWIs_p { if (keystring.length() == 0 || !new String(Word.word2hash(keystring)).equals(new String(keyhash))) { prop.put("keystring", "<not possible to compute word from hash>"); } - final RankingProcess ranking = genSearchresult(prop, sb, keyhash, null); + final RankingProcess ranking = genSearchresult(prop, sb, segment, keyhash, null); if (ranking.filteredCount() == 0) { prop.put("searchresult", 2); prop.putHTML("searchresult_wordhash", new String(keyhash)); @@ -116,7 +137,7 @@ public class IndexControlRWIs_p { // delete everything if (post.containsKey("deletecomplete") && post.containsKey("confirmDelete")) { - sb.indexSegment.clear(); + segment.clear(); sb.crawlQueues.clear(); sb.crawlStacker.clear(); try { @@ -132,9 +153,9 @@ public class IndexControlRWIs_p { if (delurl || delurlref) { // generate an urlx array ReferenceContainer index = null; - index = sb.indexSegment.termIndex().get(keyhash, null); + index = segment.termIndex().get(keyhash, null); final Iterator en = index.entries(); - int i = 0; + i = 0; urlx = new String[index.size()]; while (en.hasNext()) { urlx[i++] = en.next().metadataHash(); @@ -142,14 +163,14 @@ public class IndexControlRWIs_p { index = null; } if (delurlref) { - for (int i = 0; i < urlx.length; i++) 
sb.removeAllUrlReferences(urlx[i], true); + for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true); } if (delurl || delurlref) { - for (int i = 0; i < urlx.length; i++) { - sb.urlRemove(urlx[i]); + for (i = 0; i < urlx.length; i++) { + sb.urlRemove(segment, urlx[i]); } } - sb.indexSegment.termIndex().delete(keyhash); + segment.termIndex().delete(keyhash); post.remove("keyhashdeleteall"); post.put("urllist", "generated"); } catch (IOException e) { @@ -159,16 +180,16 @@ public class IndexControlRWIs_p { // delete selected URLs if (post.containsKey("keyhashdelete")) try { if (delurlref) { - for (int i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(urlx[i], true); + for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true); } if (delurl || delurlref) { - for (int i = 0; i < urlx.length; i++) { - sb.urlRemove(urlx[i]); + for (i = 0; i < urlx.length; i++) { + sb.urlRemove(segment, urlx[i]); } } final Set urlHashes = new HashSet(); - for (int i = 0; i < urlx.length; i++) urlHashes.add(urlx[i]); - sb.indexSegment.termIndex().remove(keyhash, urlHashes); + for (i = 0; i < urlx.length; i++) urlHashes.add(urlx[i]); + segment.termIndex().remove(keyhash, urlHashes); // this shall lead to a presentation of the list; so handle that the remaining program // thinks that it was called for a list presentation post.remove("keyhashdelete"); @@ -183,7 +204,7 @@ public class IndexControlRWIs_p { } final Bitfield flags = compileFlags(post); final int count = (post.get("lines", "all").equals("all")) ? -1 : post.getInt("lines", -1); - final RankingProcess ranking = genSearchresult(prop, sb, keyhash, flags); + final RankingProcess ranking = genSearchresult(prop, sb, segment, keyhash, flags); genURLList(prop, keyhash, keystring, ranking, flags, count); } @@ -212,7 +233,7 @@ public class IndexControlRWIs_p { // prepare index ReferenceContainer index; final long starttime = System.currentTimeMillis(); - index = sb.indexSegment.termIndex().get(keyhash, null); + index = segment.termIndex().get(keyhash, null); // built urlCache final Iterator urlIter = index.entries(); final HashMap knownURLs = new HashMap(); @@ -221,7 +242,7 @@ public class IndexControlRWIs_p { URLMetadataRow lurl; while (urlIter.hasNext()) { iEntry = urlIter.next(); - lurl = sb.indexSegment.urlMetadata().load(iEntry.metadataHash(), null, 0); + lurl = segment.urlMetadata().load(iEntry.metadataHash(), null, 0); if (lurl == null) { unknownURLEntries.add(iEntry.metadataHash()); urlIter.remove(); @@ -251,9 +272,9 @@ public class IndexControlRWIs_p { // generate list if (post.containsKey("keyhashsimilar")) try { - final Iterator> containerIt = sb.indexSegment.termIndex().references(keyhash, true, 256, false).iterator(); + final Iterator> containerIt = segment.termIndex().references(keyhash, true, 256, false).iterator(); ReferenceContainer container; - int i = 0; + i = 0; int rows = 0, cols = 0; prop.put("keyhashsimilar", "1"); while (containerIt.hasNext() && i < 256) { @@ -283,10 +304,10 @@ public class IndexControlRWIs_p { final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(","); pw = new PrintWriter(new FileWriter(new File(listManager.listsPath, blacklist), true)); yacyURL url; - for (int i=0; i
URL Retrieval
+
Select Segment:
+
+ +
+
Retrieve by URL:
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 5d8ec1a20..500edac50 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -34,9 +34,11 @@ import de.anomic.http.metadata.RequestHeader; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.RotateIterator; import de.anomic.kelondro.text.MetadataRepository; +import de.anomic.kelondro.text.Segment; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.search.Switchboard; +import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacySeedDB; @@ -49,10 +51,21 @@ public class IndexControlURLs_p { final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); + + // set default values prop.put("urlstring", ""); prop.put("urlhash", ""); prop.put("result", ""); - prop.put("ucount", Integer.toString(sb.indexSegment.urlMetadata().size())); + String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); + int i = 0; + for (String s: sb.indexSegments.segmentNames()) { + prop.put("segments_" + i + "_name", s); + prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0); + i++; + } + Segment segment = sb.indexSegments.segment(segmentName); + prop.put("segments", i); + prop.put("ucount", Integer.toString(segment.urlMetadata().size())); prop.put("otherHosts", ""); prop.put("genUrlProfile", 0); prop.put("statistics", 1); @@ -60,8 +73,22 @@ public class IndexControlURLs_p { prop.put("statisticslines", 0); prop.put("reload", 0); + // do segment selection + if (post != null && post.containsKey("segment")) { + // default values + segmentName = post.get("segment", segmentName).trim(); + i= 0; + for (String s: sb.indexSegments.segmentNames()) { + prop.put("segments_" + i + "_name", s); + prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 
1 : 0); + i++; + } + prop.put("segments", i); + segment = sb.indexSegments.segment(segmentName); + } + // show export messages - final MetadataRepository.Export export = sb.indexSegment.urlMetadata().export(); + final MetadataRepository.Export export = segment.urlMetadata().export(); if ((export != null) && (export.isAlive())) { // there is currently a running export prop.put("lurlexport", 2); @@ -108,20 +135,20 @@ public class IndexControlURLs_p { prop.put("result", " "); if (post.containsKey("urlhashdeleteall")) { - final int i = sb.removeAllUrlReferences(urlhash, true); + i = sb.removeAllUrlReferences(segment, urlhash, true); prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes."); prop.put("lurlexport", 0); prop.put("reload", 0); } if (post.containsKey("urlhashdelete")) { - final URLMetadataRow entry = sb.indexSegment.urlMetadata().load(urlhash, null, 0); + final URLMetadataRow entry = segment.urlMetadata().load(urlhash, null, 0); if (entry == null) { prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } else { urlstring = entry.metadata().url().toNormalform(false, true); prop.put("urlstring", ""); - sb.urlRemove(urlhash); + sb.urlRemove(segment, urlhash); prop.putHTML("result", "Removed URL " + urlstring); } prop.put("lurlexport", 0); @@ -137,7 +164,7 @@ public class IndexControlURLs_p { if ((urlhash == null) || (urlstring == null)) { prop.put("result", "No input given; nothing deleted."); } else { - sb.urlRemove(urlhash); + sb.urlRemove(segment, urlhash); prop.putHTML("result", "Removed URL " + urlstring); } prop.put("lurlexport", 0); @@ -149,12 +176,12 @@ public class IndexControlURLs_p { final yacyURL url = new yacyURL(urlstring, null); urlhash = url.hash(); prop.put("urlhash", urlhash); - final URLMetadataRow entry = sb.indexSegment.urlMetadata().load(urlhash, null, 0); + final URLMetadataRow entry = segment.urlMetadata().load(urlhash, null, 0); if (entry == null) { prop.putHTML("urlstring", "unknown url: " + urlstring); prop.put("urlhash", ""); } else { - prop.putAll(genUrlProfile(sb, entry, urlhash)); + prop.putAll(genUrlProfile(segment, entry, urlhash)); prop.put("statistics", 0); } } catch (final MalformedURLException e) { @@ -166,12 +193,12 @@ public class IndexControlURLs_p { } if (post.containsKey("urlhashsearch")) { - final URLMetadataRow entry = sb.indexSegment.urlMetadata().load(urlhash, null, 0); + final URLMetadataRow entry = segment.urlMetadata().load(urlhash, null, 0); if (entry == null) { prop.putHTML("result", "No Entry for URL hash " + urlhash); } else { prop.putHTML("urlstring", entry.metadata().url().toNormalform(false, true)); - prop.putAll(genUrlProfile(sb, entry, urlhash)); + prop.putAll(genUrlProfile(segment, entry, urlhash)); prop.put("statistics", 0); } prop.put("lurlexport", 0); @@ -181,10 +208,10 @@ public class IndexControlURLs_p { // generate list if (post.containsKey("urlhashsimilar")) { try { - final Iterator entryIt = new RotateIterator(sb.indexSegment.urlMetadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.indexSegment.termIndex().sizesMax()); + final Iterator entryIt = new RotateIterator(segment.urlMetadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax()); final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:
"); URLMetadataRow entry; - int i = 0; + i = 0; int rows = 0, cols = 0; prop.put("urlhashsimilar", "1"); while (entryIt.hasNext() && i < 256) { @@ -228,7 +255,7 @@ public class IndexControlURLs_p { final File f = new File(s); f.getParentFile().mkdirs(); final String filter = post.get("exportfilter", ".*"); - final MetadataRepository.Export running = sb.indexSegment.urlMetadata().export(f, filter, null, format, dom); + final MetadataRepository.Export running = segment.urlMetadata().export(f, filter, null, format, dom); prop.put("lurlexport_exportfile", s); prop.put("lurlexport_urlcount", running.count()); @@ -241,7 +268,7 @@ public class IndexControlURLs_p { if (post.containsKey("deletedomain")) { String hp = post.get("hashpart"); try { - sb.indexSegment.urlMetadata().deleteDomain(hp); + segment.urlMetadata().deleteDomain(hp); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); @@ -257,7 +284,7 @@ public class IndexControlURLs_p { prop.put("statistics_lines", count); int cnt = 0; try { - statsiter = sb.indexSegment.urlMetadata().statistics(count); + statsiter = segment.urlMetadata().statistics(count); boolean dark = true; MetadataRepository.hostStat hs; while (statsiter.hasNext() && cnt < count) { @@ -280,12 +307,12 @@ public class IndexControlURLs_p { } // insert constants - prop.putNum("ucount", sb.indexSegment.urlMetadata().size()); + prop.putNum("ucount", segment.urlMetadata().size()); // return rewrite properties return prop; } - private static serverObjects genUrlProfile(final Switchboard switchboard, final URLMetadataRow entry, final String urlhash) { + private static serverObjects genUrlProfile(final Segment segment, final URLMetadataRow entry, final String urlhash) { final serverObjects prop = new serverObjects(); if (entry == null) { prop.put("genUrlProfile", "1"); @@ -293,7 +320,7 @@ public class IndexControlURLs_p { return prop; } final URLMetadataRow.Components metadata = entry.metadata(); - final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : switchboard.indexSegment.urlMetadata().load(entry.referrerHash(), null, 0); + final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? 
null : segment.urlMetadata().load(entry.referrerHash(), null, 0); if (metadata.url() == null) { prop.put("genUrlProfile", "1"); prop.put("genUrlProfile_urlhash", urlhash); diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java index 32dddb5ee..5cef87f3f 100644 --- a/htroot/IndexImport_p.java +++ b/htroot/IndexImport_p.java @@ -35,6 +35,8 @@ import java.util.Date; import de.anomic.crawler.Importer; import de.anomic.crawler.NoticeURLImporter; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.util.ByteBuffer; import de.anomic.kelondro.util.DateFormatter; import de.anomic.search.Switchboard; @@ -45,10 +47,22 @@ public final class IndexImport_p { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements - final Switchboard switchboard = (Switchboard) env; + final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); int activeCount = 0; + + // get segment + Segment indexSegment = null; + if (post != null && post.containsKey("segment")) { + String segmentName = post.get("segment"); + if (sb.indexSegments.segmentExist(segmentName)) { + indexSegment = sb.indexSegments.segment(segmentName); + } + } else { + // take default segment + indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC); + } if (post != null) { if (post.containsKey("startIndexDbImport")) { @@ -56,13 +70,13 @@ public final class IndexImport_p { final boolean startImport = true; if (startImport) { final Importer importerThread = new NoticeURLImporter( - switchboard.queuesRoot, - switchboard.crawlQueues, - switchboard.crawler.profilesActiveCrawls, - switchboard.dbImportManager); + sb.queuesRoot, + sb.crawlQueues, + sb.crawler.profilesActiveCrawls, + sb.dbImportManager); if (importerThread != null) { - importerThread.setJobID(switchboard.dbImportManager.generateUniqueJobID()); + importerThread.setJobID(sb.dbImportManager.generateUniqueJobID()); importerThread.startIt(); } prop.put("LOCATION",""); @@ -80,7 +94,7 @@ public final class IndexImport_p { errorOut.close(); } } else if (post.containsKey("clearFinishedJobList")) { - switchboard.dbImportManager.finishedJobs.clear(); + sb.dbImportManager.finishedJobs.clear(); prop.put("LOCATION", ""); return prop; } else if ( @@ -88,9 +102,9 @@ public final class IndexImport_p { (post.containsKey("pauseIndexDbImport")) || (post.containsKey("continueIndexDbImport")) ) { - // getting the job nr of the thread + // get the job nr of the thread final String jobID = post.get("jobNr"); - final Importer importer = switchboard.dbImportManager.getImporterByID(Integer.valueOf(jobID).intValue()); + final Importer importer = sb.dbImportManager.getImporterByID(Integer.valueOf(jobID).intValue()); if (importer != null) { if (post.containsKey("stopIndexDbImport")) { try { @@ -110,13 +124,13 @@ public final class IndexImport_p { } } - prop.putNum("wcount", switchboard.indexSegment.termIndex().sizesMax()); - prop.putNum("ucount", switchboard.indexSegment.urlMetadata().size()); + prop.putNum("wcount", indexSegment.termIndex().sizesMax()); + prop.putNum("ucount", indexSegment.urlMetadata().size()); /* * Loop over all currently running jobs */ - final Importer[] importThreads = switchboard.dbImportManager.getRunningImporter(); + final Importer[] importThreads = sb.dbImportManager.getRunningImporter(); activeCount = importThreads.length; for (int i=0; i < 
activeCount; i++) { @@ -154,7 +168,7 @@ public final class IndexImport_p { /* * Loop over all finished jobs */ - final Importer[] finishedJobs = switchboard.dbImportManager.getFinishedImporter(); + final Importer[] finishedJobs = sb.dbImportManager.getFinishedImporter(); for (int i=0; i defaultSettings = ((post == null) || (!(post.containsKey("submitdefault")))) ? null : FileUtils.loadMap(defaultSettingsFile); - Iterator threads = switchboard.threadNames(); + Iterator threads = sb.threadNames(); String threadName; serverBusyThread thread; @@ -88,7 +103,7 @@ public class PerformanceQueues_p { long blocktime_total = 0, sleeptime_total = 0, exectime_total = 0; while (threads.hasNext()) { threadName = threads.next(); - thread = switchboard.getThread(threadName); + thread = sb.getThread(threadName); blocktime_total += thread.getBlockTime(); sleeptime_total += thread.getSleepTime(); exectime_total += thread.getExecTime(); @@ -101,7 +116,7 @@ public class PerformanceQueues_p { long blocktime, sleeptime, exectime; long idlesleep, busysleep, memuse, memprereq; int queuesize; - threads = switchboard.threadNames(); + threads = sb.threadNames(); int c = 0; long idleCycles, busyCycles, memshortageCycles; // set profile? @@ -110,13 +125,13 @@ public class PerformanceQueues_p { final boolean setDelay = (post != null) && (post.containsKey("submitdelay")); // save used settings file to config if (setProfile && post != null){ - switchboard.setConfig("performanceProfile", post.get("defaultFile", "defaults/yacy.init")); - switchboard.setConfig("performanceSpeed", post.getInt("profileSpeed", 100)); + sb.setConfig("performanceProfile", post.get("defaultFile", "defaults/yacy.init")); + sb.setConfig("performanceSpeed", post.getInt("profileSpeed", 100)); } while (threads.hasNext()) { threadName = threads.next(); - thread = switchboard.getThread(threadName); + thread = sb.getThread(threadName); // set values to templates prop.put("table_" + c + "_threadname", threadName); @@ -154,21 +169,21 @@ public class PerformanceQueues_p { prop.putNum("table_" + c + "_memusepercycle", (busyCycles == 0) ? 
-1 : memuse / busyCycles / 1024); // load with old values - idlesleep = switchboard.getConfigLong(threadName + "_idlesleep" , 1000); - busysleep = switchboard.getConfigLong(threadName + "_busysleep", 100); - memprereq = switchboard.getConfigLong(threadName + "_memprereq", 0); + idlesleep = sb.getConfigLong(threadName + "_idlesleep" , 1000); + busysleep = sb.getConfigLong(threadName + "_busysleep", 100); + memprereq = sb.getConfigLong(threadName + "_memprereq", 0); if (setDelay && post != null) { // load with new values idlesleep = post.getLong(threadName + "_idlesleep", idlesleep); busysleep = post.getLong(threadName + "_busysleep", busysleep); memprereq = post.getLong(threadName + "_memprereq", memprereq) * 1024; - if (memprereq == 0) memprereq = switchboard.getConfigLong(threadName + "_memprereq", 0); + if (memprereq == 0) memprereq = sb.getConfigLong(threadName + "_memprereq", 0); // check values to prevent short-cut loops if (idlesleep < 1000) idlesleep = 1000; if (threadName.equals("10_httpd")) { idlesleep = 0; busysleep = 0; memprereq = 0; } - onTheFlyReconfiguration(switchboard, threadName, idlesleep, busysleep, memprereq); + onTheFlyReconfiguration(sb, threadName, idlesleep, busysleep, memprereq); } if (setProfile) { if (threadName.equals(SwitchboardConstants.PEER_PING) || threadName.equals(SwitchboardConstants.SEED_UPLOAD) @@ -177,7 +192,7 @@ public class PerformanceQueues_p { ) { /* do not change any values */ } else if (threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER) || threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) { - switchboard.setRemotecrawlPPM(Math.max(1, (int) (switchboard.getConfigLong("network.unit.remotecrawl.speed", 60) / multiplier))); + sb.setRemotecrawlPPM(Math.max(1, (int) (sb.getConfigLong("network.unit.remotecrawl.speed", 60) / multiplier))); } else { // load with new values @@ -190,7 +205,7 @@ public class PerformanceQueues_p { if (threadName.equals("10_httpd")) { idlesleep = 0; busysleep = 0; memprereq = 0; } //if (threadName.equals(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) && (busysleep < 50)) busysleep = 50; - onTheFlyReconfiguration(switchboard, threadName, idlesleep, busysleep, memprereq); + onTheFlyReconfiguration(sb, threadName, idlesleep, busysleep, memprereq); } } prop.put("table_" + c + "_idlesleep", idlesleep); @@ -199,14 +214,14 @@ public class PerformanceQueues_p { // disallow setting of memprereq for indexer to prevent db from throwing OOMs prop.put("table_" + c + "_disabled", /*(threadName.endsWith("_indexing")) ? 1 :*/ "0"); prop.put("table_" + c + "_recommendation", threadName.endsWith("_indexing") ? "1" : "0"); - prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (switchboard.indexSegment.termIndex().minMem() / 1024) : 0); + prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? 
(indexSegment.termIndex().minMem() / 1024) : 0); c++; } prop.put("table", c); // performance profiles c = 0; - final String usedfile = switchboard.getConfig("performanceProfile", "defaults/yacy.init"); + final String usedfile = sb.getConfig("performanceProfile", "defaults/yacy.init"); for(final String filename: performanceProfiles.keySet()) { prop.put("profile_" + c + "_filename", filename); prop.put("profile_" + c + "_description", performanceProfiles.get(filename)); @@ -217,7 +232,7 @@ public class PerformanceQueues_p { c = 0; final int[] speedValues = {200,150,100,50,25,10}; - final int usedspeed = Integer.parseInt(switchboard.getConfig("performanceSpeed", "100")); + final int usedspeed = Integer.parseInt(sb.getConfig("performanceSpeed", "100")); for(final int speed: speedValues){ prop.put("speed_" + c + "_value", speed); prop.put("speed_" + c + "_label", speed + " %"); @@ -228,8 +243,8 @@ public class PerformanceQueues_p { if ((post != null) && (post.containsKey("cacheSizeSubmit"))) { final int wordCacheMaxCount = post.getInt("wordCacheMaxCount", 20000); - switchboard.setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount)); - switchboard.indexSegment.termIndex().setBufferMaxWordCount(wordCacheMaxCount); + sb.setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount)); + indexSegment.termIndex().setBufferMaxWordCount(wordCacheMaxCount); } if ((post != null) && (post.containsKey("poolConfig"))) { @@ -237,17 +252,17 @@ public class PerformanceQueues_p { /* * configuring the crawler pool */ - // getting the current crawler pool configuration + // get the current crawler pool configuration int maxBusy = Integer.parseInt(post.get("Crawler Pool_maxActive","8")); // storing the new values into configfile - switchboard.setConfig(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX,maxBusy); + sb.setConfig(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX,maxBusy); //switchboard.setConfig("crawler.MinIdleThreads",minIdle); /* * configuring the http pool */ - final serverThread httpd = switchboard.getThread("10_httpd"); + final serverThread httpd = sb.getThread("10_httpd"); try { maxBusy = Integer.parseInt(post.get("httpd Session Pool_maxActive","8")); } catch (final NumberFormatException e) { @@ -257,61 +272,61 @@ public class PerformanceQueues_p { ((serverCore)httpd).setMaxSessionCount(maxBusy); // storing the new values into configfile - switchboard.setConfig("httpdMaxBusySessions",maxBusy); + sb.setConfig("httpdMaxBusySessions",maxBusy); } if ((post != null) && (post.containsKey("PrioritySubmit"))) { - switchboard.setConfig("javastart_priority",post.get("YaCyPriority","0")); + sb.setConfig("javastart_priority",post.get("YaCyPriority","0")); } if ((post != null) && (post.containsKey("onlineCautionSubmit"))) { - switchboard.setConfig(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseProxy", 30000))); - switchboard.setConfig(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseLocalsearch", 30000))); - switchboard.setConfig(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseRemotesearch", 30000))); + sb.setConfig(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseProxy", 30000))); + sb.setConfig(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseLocalsearch", 30000))); + sb.setConfig(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, 
Integer.toString(post.getInt("crawlPauseRemotesearch", 30000))); } if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) { - final long minimumLocalDelta = post.getLong("minimumLocalDelta", switchboard.crawlQueues.noticeURL.getMinimumLocalDelta()); - final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", switchboard.crawlQueues.noticeURL.getMinimumGlobalDelta()); - switchboard.setConfig("minimumLocalDelta", minimumLocalDelta); - switchboard.setConfig("minimumGlobalDelta", minimumGlobalDelta); - switchboard.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); + final long minimumLocalDelta = post.getLong("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta()); + final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta()); + sb.setConfig("minimumLocalDelta", minimumLocalDelta); + sb.setConfig("minimumGlobalDelta", minimumGlobalDelta); + sb.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); } // delta settings - prop.put("minimumLocalDelta", switchboard.crawlQueues.noticeURL.getMinimumLocalDelta()); - prop.put("minimumGlobalDelta", switchboard.crawlQueues.noticeURL.getMinimumGlobalDelta()); + prop.put("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta()); + prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta()); // table cache settings - prop.putNum("urlCacheSize", switchboard.indexSegment.urlMetadata().writeCacheSize()); - prop.putNum("wordCacheSize", switchboard.indexSegment.termIndex().getBufferSize()); - prop.putNum("wordCacheSizeKBytes", switchboard.indexSegment.termIndex().getBufferSizeBytes()/1024); - prop.putNum("maxURLinCache", switchboard.indexSegment.termIndex().getBufferMaxReferences()); - prop.putNum("maxAgeOfCache", switchboard.indexSegment.termIndex().getBufferMaxAge() / 1000 / 60); // minutes - prop.putNum("minAgeOfCache", switchboard.indexSegment.termIndex().getBufferMinAge() / 1000 / 60); // minutes - prop.putNum("maxWaitingWordFlush", switchboard.getConfigLong("maxWaitingWordFlush", 180)); - prop.put("wordCacheMaxCount", switchboard.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000)); - prop.put("crawlPauseProxy", switchboard.getConfigLong(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, 30000)); - prop.put("crawlPauseLocalsearch", switchboard.getConfigLong(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, 30000)); - prop.put("crawlPauseRemotesearch", switchboard.getConfigLong(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, 30000)); - prop.putNum("crawlPauseProxyCurrent", (System.currentTimeMillis() - switchboard.proxyLastAccess) / 1000); - prop.putNum("crawlPauseLocalsearchCurrent", (System.currentTimeMillis() - switchboard.localSearchLastAccess) / 1000); - prop.putNum("crawlPauseRemotesearchCurrent", (System.currentTimeMillis() - switchboard.remoteSearchLastAccess) / 1000); + prop.putNum("urlCacheSize", indexSegment.urlMetadata().writeCacheSize()); + prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize()); + prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024); + prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences()); + prop.putNum("maxAgeOfCache", indexSegment.termIndex().getBufferMaxAge() / 1000 / 60); // minutes + prop.putNum("minAgeOfCache", indexSegment.termIndex().getBufferMinAge() / 1000 / 60); // minutes + prop.putNum("maxWaitingWordFlush", sb.getConfigLong("maxWaitingWordFlush", 
180)); + prop.put("wordCacheMaxCount", sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000)); + prop.put("crawlPauseProxy", sb.getConfigLong(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, 30000)); + prop.put("crawlPauseLocalsearch", sb.getConfigLong(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, 30000)); + prop.put("crawlPauseRemotesearch", sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, 30000)); + prop.putNum("crawlPauseProxyCurrent", (System.currentTimeMillis() - sb.proxyLastAccess) / 1000); + prop.putNum("crawlPauseLocalsearchCurrent", (System.currentTimeMillis() - sb.localSearchLastAccess) / 1000); + prop.putNum("crawlPauseRemotesearchCurrent", (System.currentTimeMillis() - sb.remoteSearchLastAccess) / 1000); // table thread pool settings prop.put("pool_0_name","Crawler Pool"); - prop.put("pool_0_maxActive", switchboard.getConfigLong("crawler.MaxActiveThreads", 0)); - prop.put("pool_0_numActive",switchboard.crawlQueues.size()); + prop.put("pool_0_maxActive", sb.getConfigLong("crawler.MaxActiveThreads", 0)); + prop.put("pool_0_numActive",sb.crawlQueues.size()); - final serverThread httpd = switchboard.getThread("10_httpd"); + final serverThread httpd = sb.getThread("10_httpd"); prop.put("pool_1_name", "httpd Session Pool"); prop.put("pool_1_maxActive", ((serverCore)httpd).getMaxSessionCount()); prop.put("pool_1_numActive", ((serverCore)httpd).getJobCount()); prop.put("pool", "2"); - final long curr_prio = switchboard.getConfigLong("javastart_priority",0); + final long curr_prio = sb.getConfigLong("javastart_priority",0); prop.put("priority_normal",(curr_prio==0) ? "1" : "0"); prop.put("priority_below",(curr_prio==10) ? "1" : "0"); prop.put("priority_low",(curr_prio==20) ? "1" : "0"); diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 372b72f5f..643d6f9e5 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -38,6 +38,8 @@ import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Request; import de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.Segments; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -58,11 +60,23 @@ public class QuickCrawlLink_p { final serverObjects prop = new serverObjects(); final Switchboard sb = (Switchboard) env; + // get segment + Segment indexSegment = null; + if (post != null && post.containsKey("segment")) { + String segmentName = post.get("segment"); + if (sb.indexSegments.segmentExist(segmentName)) { + indexSegment = sb.indexSegments.segment(segmentName); + } + } else { + // take default segment + indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC); + } + if (post == null) { // send back usage example prop.put("mode", "0"); - // getting the http host header + // get the http host header final String hostSocket = header.get(HeaderFramework.CONNECTION_PROP_HOST); //String host = hostSocket; @@ -80,7 +94,7 @@ public class QuickCrawlLink_p { } prop.put("mode", "1"); - // getting the URL + // get the URL String crawlingStart = post.get("url",null); try { crawlingStart = URLDecoder.decode(crawlingStart, "UTF-8"); @@ -89,10 +103,10 @@ public class QuickCrawlLink_p { e1.printStackTrace(); } - // getting the browser title + // get the browser title final String title = post.get("title",null); - // getting other parameters if set + // get other parameters if set 
final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_BAD_URL); final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0")); @@ -123,7 +137,7 @@ public class QuickCrawlLink_p { } final String urlhash = crawlingStartURL.hash(); - sb.indexSegment.urlMetadata().remove(urlhash); + indexSegment.urlMetadata().remove(urlhash); sb.crawlQueues.noticeURL.removeByURLHash(urlhash); sb.crawlQueues.errorURL.remove(urlhash); diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index ce93b654b..f5100c9ee 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -345,7 +345,7 @@ public class SettingsAck_p { } if (post.containsKey("seedSettings")) { - // getting the currently used uploading method + // get the currently used uploading method final String oldSeedUploadMethod = env.getConfig("seedUploadMethod","none"); final String newSeedUploadMethod = post.get("seedUploadMethod"); final String oldSeedURLStr = sb.peers.mySeed().get(yacySeed.SEEDLIST, ""); @@ -385,7 +385,7 @@ public class SettingsAck_p { final HashMap uploaders = yacyCore.getSeedUploadMethods(); final Iterator uploaderKeys = uploaders.keySet().iterator(); while (uploaderKeys.hasNext()) { - // getting the uploader module name + // get the uploader module name final String uploaderName = uploaderKeys.next(); @@ -452,7 +452,7 @@ public class SettingsAck_p { // Crawler settings if (post.containsKey("crawlerSettings")) { - // getting Crawler Timeout + // get Crawler Timeout String timeoutStr = post.get("crawler.clientTimeout"); if (timeoutStr==null||timeoutStr.length()==0) timeoutStr = "10000"; @@ -467,7 +467,7 @@ public class SettingsAck_p { return prop; } - // getting maximum http file size + // get maximum http file size String maxSizeStr = post.get("crawler.http.maxFileSize"); if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1"; @@ -484,7 +484,7 @@ public class SettingsAck_p { return prop; } - // getting maximum ftp file size + // get maximum ftp file size maxSizeStr = post.get("crawler.ftp.maxFileSize"); if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1"; diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index ae5db1575..24d63c8a4 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -45,6 +45,8 @@ import de.anomic.http.client.Client; import de.anomic.http.client.Cache; import de.anomic.http.metadata.RequestHeader; import de.anomic.http.metadata.ResponseHeader; +import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.FileUtils; import de.anomic.search.Switchboard; @@ -72,6 +74,18 @@ public class ViewFile { final int display = (post == null) ? 
0 : post.getInt("display", 0); + // get segment + Segment indexSegment = null; + if (post != null && post.containsKey("segment")) { + String segmentName = post.get("segment"); + if (sb.indexSegments.segmentExist(segmentName)) { + indexSegment = sb.indexSegments.segment(segmentName); + } + } else { + // take default segment + indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC); + } + prop.put("display", display); prop.put("error_display", display); @@ -90,12 +104,12 @@ public class ViewFile { int size = 0; boolean pre = false; - // getting the url hash from which the content should be loaded + // get the url hash from which the content should be loaded final String urlHash = post.get("urlHash",""); if (urlHash.length() > 0) { - // getting the urlEntry that belongs to the url hash + // get the urlEntry that belongs to the url hash URLMetadataRow urlEntry = null; - urlEntry = sb.indexSegment.urlMetadata().load(urlHash, null, 0); + urlEntry = indexSegment.urlMetadata().load(urlHash, null, 0); if (urlEntry == null) { prop.put("error", "2"); prop.put("viewMode",VIEW_MODE_NO_TEXT); diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index 6f1e92fea..6f712be9f 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -45,6 +45,8 @@ import de.anomic.data.listManager; import de.anomic.document.parser.html.ContentScraper; import de.anomic.document.parser.html.TransformerWriter; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.util.FileUtils; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; @@ -86,6 +88,18 @@ public class WatchCrawler_p { prop.put("list-remote", 0); prop.put("forwardToCrawlStart", "0"); + // get segment + Segment indexSegment = null; + if (post != null && post.containsKey("segment")) { + String segmentName = post.get("segment"); + if (sb.indexSegments.segmentExist(segmentName)) { + indexSegment = sb.indexSegments.segment(segmentName); + } + } else { + // take default segment + indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC); + } + prop.put("info", "0"); if (post != null) { // a crawl start @@ -216,7 +230,7 @@ public class WatchCrawler_p { // first delete old entry, if exists final yacyURL url = new yacyURL(crawlingStart, null); final String urlhash = url.hash(); - sb.indexSegment.urlMetadata().remove(urlhash); + indexSegment.urlMetadata().remove(urlhash); sb.crawlQueues.noticeURL.removeByURLHash(urlhash); sb.crawlQueues.errorURL.remove(urlhash); diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java index 637d3450f..121aec269 100644 --- a/htroot/WebStructurePicture_p.java +++ b/htroot/WebStructurePicture_p.java @@ -93,7 +93,7 @@ public class WebStructurePicture_p { try { hash = (new yacyURL("http://" + host, null)).hash().substring(6); } catch (final MalformedURLException e) {e.printStackTrace();} - assert (sb.webStructure.outgoingReferences(hash) != null); + //assert (sb.webStructure.outgoingReferences(hash) != null); // recursively find domains, up to a specific depth final ymageGraph graph = new ymageGraph(); diff --git a/htroot/api/queues_p.java b/htroot/api/queues_p.java index d5760a66f..bab89a094 100755 --- a/htroot/api/queues_p.java +++ b/htroot/api/queues_p.java @@ -6,6 +6,8 @@ import java.util.Locale; import de.anomic.crawler.NoticedURL; import de.anomic.crawler.retrieval.Request; import de.anomic.http.metadata.RequestHeader; +import 
de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.Segments; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverObjects; @@ -28,16 +30,22 @@ public class queues_p { final Switchboard sb = (Switchboard) env; //wikiCode wikiTransformer = new wikiCode(switchboard); final serverObjects prop = new serverObjects(); - if (post == null || !post.containsKey("html")) + Segment segment = null; + if (post == null || !post.containsKey("html")) { prop.setLocalized(false); + if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header, false)) { + segment = sb.indexSegments.segment(post.get("segment")); + } + } + if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC); prop.put("rejected", "0"); //int showRejectedCount = 10; yacySeed initiator; // index size - prop.putNum("urlpublictextSize", sb.indexSegment.urlMetadata().size()); - prop.putNum("rwipublictextSize", sb.indexSegment.termIndex().sizesMax()); + prop.putNum("urlpublictextSize", segment.urlMetadata().size()); + prop.putNum("rwipublictextSize", segment.termIndex().sizesMax()); // loader queue prop.put("loaderSize", Integer.toString(sb.crawlQueues.size())); diff --git a/htroot/api/status_p.java b/htroot/api/status_p.java index 6bfcad6db..d8d46ad08 100644 --- a/htroot/api/status_p.java +++ b/htroot/api/status_p.java @@ -3,6 +3,8 @@ import de.anomic.http.io.ByteCountInputStream; import de.anomic.http.io.ByteCountOutputStream; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.util.MemoryControl; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverObjects; @@ -17,14 +19,21 @@ public class status_p { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); - if (post == null || !post.containsKey("html")) + Segment segment = null; + if (post == null || !post.containsKey("html")) { prop.setLocalized(false); + if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header, false)) { + segment = sb.indexSegments.segment(post.get("segment")); + } + } + if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC); + prop.put("rejected", "0"); sb.updateMySeed(); final int cacheMaxSize = (int) sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 10000); prop.putNum("ppm", sb.currentPPM()); prop.putNum("qpm", sb.peers.mySeed().getQPM()); - prop.put("wordCacheSize", Integer.toString(sb.indexSegment.termIndex().getBufferSize())); + prop.put("wordCacheSize", Integer.toString(segment.termIndex().getBufferSize())); prop.put("wordCacheMaxSize", Integer.toString(cacheMaxSize)); // // memory usage and system attributes diff --git a/htroot/api/timeline.java b/htroot/api/timeline.java index 3020c9077..e84cfa74f 100644 --- a/htroot/api/timeline.java +++ b/htroot/api/timeline.java @@ -32,6 +32,7 @@ import de.anomic.document.Word; import de.anomic.http.metadata.RequestHeader; import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.TermSearch; import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.kelondro.util.DateFormatter; @@ -52,6 +53,13 @@ public final class timeline { if ((post == null) || (env == null)) return prop; final boolean authenticated = sb.adminAuthenticated(header) >= 2; + Segment
segment = null; + if (post.containsKey("segment") && authenticated) { + segment = sb.indexSegments.segment(post.get("segment")); + } else { + segment = sb.indexSegments.segment(Segments.Process.PUBLIC); + } + final String querystring = post.get("query", ""); // a string of word hashes that shall be searched and combined final int count = Math.min((authenticated) ? 1000 : 10, post.getInt("maximumRecords", 1000)); // SRU syntax final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); @@ -80,7 +88,7 @@ public final class timeline { //yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links"); // get the index container with the result vector - final TermSearch search = sb.indexSegment.termIndex().query( + final TermSearch search = segment.termIndex().query( q, Word.words2hashes(query[1]), null, diff --git a/htroot/api/yacydoc.java b/htroot/api/yacydoc.java index 667eb2cb8..fa4eeab2c 100644 --- a/htroot/api/yacydoc.java +++ b/htroot/api/yacydoc.java @@ -28,6 +28,8 @@ import java.net.MalformedURLException; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; @@ -42,6 +44,15 @@ public class yacydoc { final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); + Segment segment = null; + if (post == null || !post.containsKey("html")) { + if (post.containsKey("segment") && sb.verifyAuthentication(header, false)) { + segment = sb.indexSegments.segment(post.get("segment")); + } + } + if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC); + + prop.put("dc_title", ""); prop.put("dc_creator", ""); prop.put("dc_description", ""); @@ -68,14 +79,14 @@ public class yacydoc { } if (urlhash == null || urlhash.length() == 0) return prop; - final URLMetadataRow entry = sb.indexSegment.urlMetadata().load(urlhash, null, 0); + final URLMetadataRow entry = segment.urlMetadata().load(urlhash, null, 0); if (entry == null) return prop; final URLMetadataRow.Components metadata = entry.metadata(); if (metadata.url() == null) { return prop; } - final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : sb.indexSegment.urlMetadata().load(entry.referrerHash(), null, 0); + final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? 
null : segment.urlMetadata().load(entry.referrerHash(), null, 0); prop.putXML("dc_title", metadata.dc_title()); prop.putXML("dc_creator", metadata.dc_creator()); diff --git a/htroot/autoconfig.java b/htroot/autoconfig.java index 65f0db0d4..d11970a0f 100644 --- a/htroot/autoconfig.java +++ b/htroot/autoconfig.java @@ -50,7 +50,7 @@ public class autoconfig { final boolean yacyonly = env.getConfigBool(SwitchboardConstants.PROXY_YACY_ONLY, false); - // getting the http host header + // get the http host header final String hostSocket = header.get(HeaderFramework.CONNECTION_PROP_HOST); String host = hostSocket; diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java index c62f11899..2e562c213 100644 --- a/htroot/sharedBlacklist_p.java +++ b/htroot/sharedBlacklist_p.java @@ -73,7 +73,7 @@ public class sharedBlacklist_p { // return variable that accumulates replacements final serverObjects prop = new serverObjects(); - // getting the name of the destination blacklist + // get the name of the destination blacklist String selectedBlacklistName = ""; if( post != null && post.containsKey("currentBlacklist") ){ selectedBlacklistName = post.get("currentBlacklist"); @@ -113,7 +113,7 @@ public class sharedBlacklist_p { * Import blacklist from other peer * ====================================================== */ - // getting the source peer hash + // get the source peer hash final String Hash = post.get("hash"); // generate the download URL diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 8a995a51f..731fd0a8d 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -31,6 +31,7 @@ import java.io.IOException; import de.anomic.crawler.ZURL; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; @@ -136,7 +137,7 @@ public final class crawlReceipt { if (result.equals("fill")) try { // put new entry into database - sb.indexSegment.urlMetadata().store(entry); + sb.indexSegments.urlMetadata(Segments.Process.RECEIPTS).store(entry); sb.crawlResults.stack(entry, youare, iam, 1); sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + entry.hash() + ":" + metadata.url().toNormalform(false, true)); diff --git a/htroot/yacy/message.java b/htroot/yacy/message.java index 93783ac62..5e1a5d649 100644 --- a/htroot/yacy/message.java +++ b/htroot/yacy/message.java @@ -167,10 +167,10 @@ public final class message { try { if (!Boolean.valueOf(sb.getConfig("msgForwardingEnabled","false")).booleanValue()) return; - // getting the recipient address + // get the recipient address final String sendMailTo = sb.getConfig("msgForwardingTo","root@localhost").trim(); - // getting the sendmail configuration + // get the sendmail configuration final String sendMailStr = sb.getConfig("msgForwardingCmd","/usr/bin/sendmail")+" "+sendMailTo; final String[] sendMail = sendMailStr.trim().split(" "); diff --git a/htroot/yacy/query.java b/htroot/yacy/query.java index baf48b55d..857ff3900 100644 --- a/htroot/yacy/query.java +++ b/htroot/yacy/query.java @@ -31,6 +31,7 @@ import java.io.IOException; import de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.util.DateFormatter; import 
de.anomic.search.Switchboard; import de.anomic.server.serverObjects; @@ -85,7 +86,7 @@ public final class query { if (obj.equals("rwiurlcount")) try { // the total number of different urls in the rwi is returned // shall contain a word hash, the number of assigned lurls to this hash is returned - prop.put("response", sb.indexSegment.termIndex().get(env.getBytes(), null).size()); + prop.put("response", sb.indexSegments.termIndex(Segments.Process.PUBLIC).get(env.getBytes(), null).size()); return prop; } catch (IOException e) { e.printStackTrace(); @@ -93,13 +94,13 @@ public final class query { if (obj.equals("rwicount")) { // return the total number of available word indexes - prop.put("response", sb.indexSegment.termIndex().sizesMax()); + prop.put("response", sb.indexSegments.termIndex(Segments.Process.PUBLIC).sizesMax()); return prop; } if (obj.equals("lurlcount")) { // return the number of all available l-url's - prop.put("response", sb.indexSegment.urlMetadata().size()); + prop.put("response", sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).size()); return prop; } diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index a8d71b7eb..09e3c195e 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -41,6 +41,7 @@ import de.anomic.http.metadata.RequestHeader; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.text.ReferenceContainer; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.kelondro.util.SortStack; import de.anomic.net.natLib; @@ -216,7 +217,7 @@ public final class search { final long timer = System.currentTimeMillis(); //final Map>[] containers = sb.indexSegment.index().searchTerm(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2StringSet(urls)); - final HashMap> incc = sb.indexSegment.termIndex().searchConjunction(theQuery.queryHashes, QueryParams.hashes2StringSet(urls)); + final HashMap> incc = sb.indexSegments.termIndex(Segments.Process.PUBLIC).searchConjunction(theQuery.queryHashes, QueryParams.hashes2StringSet(urls)); serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(theQuery.id(true), SearchEvent.COLLECTION, incc.size(), System.currentTimeMillis() - timer), false); if (incc != null) { @@ -269,7 +270,7 @@ public final class search { RSSFeed.channels(RSSFeed.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? 
"unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.queryHashes), "")); // make event - theSearch = SearchEventCache.getEvent(theQuery, sb.indexSegment, sb.peers, sb.crawlResults, null, true); + theSearch = SearchEventCache.getEvent(theQuery, sb.indexSegments.segment(Segments.Process.PUBLIC), sb.peers, sb.crawlResults, null, true); // set statistic details of search result and find best result index set if (theSearch.getRankingResult().getLocalResourceSize() == 0) { diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 531108685..da92662e8 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -36,6 +36,7 @@ import de.anomic.content.RSSMessage; import de.anomic.data.Blacklist; import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.FileUtils; import de.anomic.search.Switchboard; @@ -111,9 +112,9 @@ public final class transferRWI { sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". Not granted. This peer is in robinson mode"); result = "not_granted"; pause = 60000; - } else if (sb.indexSegment.termIndex().getBufferSize() > cachelimit) { + } else if (sb.indexSegments.termIndex(Segments.Process.DHTIN).getBufferSize() > cachelimit) { // we are too busy to receive indexes - sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.indexSegment.termIndex().getBufferSize() + ")."); + sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.indexSegments.termIndex(Segments.Process.DHTIN).getBufferSize() + ")."); granted = false; // don't accept more words if there are too many words to flush result = "busy"; pause = 60000; @@ -180,7 +181,7 @@ public final class transferRWI { // learn entry try { - sb.indexSegment.termIndex().add(wordHash.getBytes(), iEntry); + sb.indexSegments.termIndex(Segments.Process.DHTIN).add(wordHash.getBytes(), iEntry); } catch (IOException e) { e.printStackTrace(); } @@ -188,7 +189,7 @@ public final class transferRWI { // check if we need to ask for the corresponding URL if (!(knownURL.contains(urlHash)||unknownURL.contains(urlHash))) try { - if (sb.indexSegment.urlMetadata().exists(urlHash)) { + if (sb.indexSegments.urlMetadata(Segments.Process.DHTIN).exists(urlHash)) { knownURL.add(urlHash); } else { unknownURL.add(urlHash); @@ -221,7 +222,7 @@ public final class transferRWI { } result = "ok"; - pause = (int) (sb.indexSegment.termIndex().getBufferSize() * 20000 / sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time + pause = (int) (sb.indexSegments.termIndex(Segments.Process.DHTIN).getBufferSize() * 20000 / sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time } prop.put("unknownURL", unknownURLs.toString()); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 57d6661f1..a80a349ea 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -33,6 +33,7 @@ import de.anomic.content.RSSMessage; import de.anomic.data.Blacklist; import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; 
import de.anomic.search.Switchboard; @@ -44,7 +45,6 @@ import de.anomic.yacy.yacyNetwork; import de.anomic.yacy.yacySeed; public final class transferURL { - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) throws InterruptedException { final long start = System.currentTimeMillis(); @@ -81,7 +81,7 @@ public final class transferURL { } else { int received = 0; int blocked = 0; - final int sizeBefore = sb.indexSegment.urlMetadata().size(); + final int sizeBefore = sb.indexSegments.urlMetadata(Segments.Process.DHTIN).size(); // read the urls from the other properties and store String urls; URLMetadataRow lEntry; @@ -139,7 +139,7 @@ public final class transferURL { // write entry to database yacyCore.log.logInfo("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false)); try { - sb.indexSegment.urlMetadata().store(lEntry); + sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry); sb.crawlResults.stack(lEntry, iam, iam, 3); if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName); received++; @@ -151,7 +151,7 @@ public final class transferURL { sb.peers.mySeed().incRU(received); // return rewrite properties - final int more = sb.indexSegment.urlMetadata().size() - sizeBefore; + final int more = sb.indexSegments.urlMetadata(Segments.Process.DHTIN).size() - sizeBefore; doublevalues = Integer.toString(received - more); yacyCore.log.logInfo("Received " + received + " URLs from peer " + otherPeerName + " in " + (System.currentTimeMillis() - start) + " ms, blocked " + blocked + " URLs"); RSSFeed.channels(RSSFeed.INDEXRECEIVE).addMessage(new RSSMessage("Received " + received + " URLs from peer " + otherPeerName + ", blocked " + blocked, "", "")); diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index c8ff72347..3049a0132 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -30,6 +30,7 @@ import java.util.Date; import de.anomic.crawler.NoticedURL; import de.anomic.crawler.retrieval.Request; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.search.Switchboard; @@ -75,7 +76,7 @@ public class urls { if (entry == null) break; // find referrer, if there is one - referrer = sb.getURL(entry.referrerhash()); + referrer = sb.getURL(Segments.Process.PUBLIC, entry.referrerhash()); // place url to notice-url db sb.crawlQueues.delegatedURL.push( @@ -112,10 +113,10 @@ public class urls { URLMetadataRow.Components metadata; yacyURL referrer; for (int i = 0; i < count; i++) { - entry = sb.indexSegment.urlMetadata().load(urlhashes.substring(12 * i, 12 * (i + 1)), null, 0); + entry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(urlhashes.substring(12 * i, 12 * (i + 1)), null, 0); if (entry == null) continue; // find referrer, if there is one - referrer = sb.getURL(entry.referrerHash()); + referrer = sb.getURL(Segments.Process.PUBLIC, entry.referrerHash()); // create RSS entry metadata = entry.metadata(); prop.put("item_" + c + "_title", metadata.dc_title()); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 9e1dfe47a..a8b0ad219 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -45,6 +45,8 @@ import de.anomic.document.parser.xml.RSSFeed; import 
de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; import de.anomic.kelondro.order.Bitfield; +import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.MemoryControl; import de.anomic.kelondro.util.SetTools; @@ -96,11 +98,23 @@ public class yacysearch { boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true")); final serverObjects prop = new serverObjects(); + // get segment + Segment indexSegment = null; + if (post != null && post.containsKey("segment")) { + String segmentName = post.get("segment"); + if (sb.indexSegments.segmentExist(segmentName)) { + indexSegment = sb.indexSegments.segment(segmentName); + } + } else { + // take default segment + indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC); + } + //final boolean rss = (post == null) ? false : post.get("rss", "false").equals("true"); prop.put("promoteSearchPageGreeting", promoteSearchPageGreeting); prop.put("promoteSearchPageGreeting.homepage", sb.getConfig(SwitchboardConstants.GREETING_HOMEPAGE, "")); prop.put("promoteSearchPageGreeting.smallImage", sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, "")); - if ((post == null) || (env == null) || (!searchAllowed)) { + if (post == null || indexSegment == null || env == null || !searchAllowed) { // we create empty entries for template strings prop.put("searchagain", "0"); prop.put("display", display); @@ -235,7 +249,7 @@ public class yacysearch { // check available memory and clean up if necessary if (!MemoryControl.request(8000000L, false)) { - sb.indexSegment.urlMetadata().clearCache(); + indexSegment.urlMetadata().clearCache(); SearchEventCache.cleanupEvents(true); } @@ -374,7 +388,7 @@ public class yacysearch { // delete the index entry locally final String delHash = post.get("deleteref", ""); // urlhash - sb.indexSegment.termIndex().remove(Word.words2hashes(query[0]), delHash); + indexSegment.termIndex().remove(Word.words2hashes(query[0]), delHash); // make new news message with negative voting final HashMap map = new HashMap(); @@ -393,7 +407,7 @@ public class yacysearch { return prop; } final String recommendHash = post.get("recommendref", ""); // urlhash - final URLMetadataRow urlentry = sb.indexSegment.urlMetadata().load(recommendHash, null, 0); + final URLMetadataRow urlentry = indexSegment.urlMetadata().load(recommendHash, null, 0); if (urlentry != null) { final URLMetadataRow.Components metadata = urlentry.metadata(); Document document; @@ -462,7 +476,7 @@ public class yacysearch { theQuery.setOffset(0); // in case that this is a new search, always start without a offset offset = 0; } - final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.indexSegment, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false); + final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, indexSegment, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? 
sb.clusterhashes : null, false); // generate result object //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms"); @@ -490,7 +504,7 @@ public class yacysearch { } prop.put("meanCount", meanMax); if (meanMax > 0) { - DidYouMean didYouMean = new DidYouMean(sb.indexSegment.termIndex()); + DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex()); Iterator meanIt = didYouMean.getSuggestions(querystring, 300, 10).iterator(); int meanCount = 0; String suggestion; diff --git a/source/de/anomic/content/oai/PMHReader.java b/source/de/anomic/content/oai/PMHReader.java index 95e76b16a..6c6b66df3 100644 --- a/source/de/anomic/content/oai/PMHReader.java +++ b/source/de/anomic/content/oai/PMHReader.java @@ -71,6 +71,59 @@ public class PMHReader { } catch (InterruptedException e) {} } + public static StringBuilder escape(final String s) { + final int len = s.length(); + final StringBuilder sbuf = new StringBuilder(len + 10); + for (int i = 0; i < len; i++) { + final int ch = s.charAt(i); + if ('A' <= ch && ch <= 'Z') { // 'A'..'Z' + sbuf.append((char)ch); + } else if ('a' <= ch && ch <= 'z') { // 'a'..'z' + sbuf.append((char)ch); + } else if ('0' <= ch && ch <= '9') { // '0'..'9' + sbuf.append((char)ch); + } else if (ch == ' ') { // space + sbuf.append("%20"); + } else if (ch == '&' || ch == ':' // unreserved + || ch == '-' || ch == '_' + || ch == '.' || ch == '!' + || ch == '~' || ch == '*' + || ch == '\'' || ch == '(' + || ch == ')' || ch == ';') { + sbuf.append((char)ch); + } + } + return sbuf; + } + + public static String unescape(final String s) { + final int l = s.length(); + final StringBuilder sbuf = new StringBuilder(l); + int ch = -1; + int b, sumb = 0; + for (int i = 0, more = -1; i < l; i++) { + /* Get next byte b from URL segment s */ + switch (ch = s.charAt(i)) { + case '%': + if (i + 2 < l) { + ch = s.charAt(++i); + int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF; + ch = s.charAt(++i); + int lb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF; + b = (hb << 4) | lb; + } else { + b = ch; + } + break; + case '+': + b = ' '; + break; + default: + b = ch; + } + } + return sbuf.toString(); + } public static void main(String[] args) { // get one server with // http://roar.eprints.org/index.php?action=csv diff --git a/source/de/anomic/crawler/RobotsEntry.java b/source/de/anomic/crawler/RobotsEntry.java new file mode 100644 index 000000000..25d42fbbc --- /dev/null +++ b/source/de/anomic/crawler/RobotsEntry.java @@ -0,0 +1,209 @@ +//RobotsEntry.java +//------------------------------------- +//part of YACY +//(C) by Michael Peter Christen; mc@yacy.net +//first published on http://www.anomic.de +//Frankfurt, Germany, 2004 +// +//This file is contributed by Martin Thelian +// [MC] moved some methods from robotsParser file that had been created by Alexander Schier to this class +// [MC] redesign: removed entry object from RobotsTxt Class into ths separate class + +//last major change: $LastChangedDate$ by $LastChangedBy$ +//Revision: $LastChangedRevision$ +// +//This program is free software; you can redistribute it and/or modify +//it under the terms of the GNU General Public License as published by +//the Free Software Foundation; either version 2 of the License, or +//(at your option) any later version. 
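// ---------------------------------------------------------------------------
// Note on the PMHReader.unescape(String) helper added above: as committed it
// computes each decoded byte into b but never appends anything to sbuf, so it
// always returns an empty string. A minimal corrected sketch for single-byte
// percent-decoding follows; it is an illustration only (the method name
// unescapeSketch is ours), and like the original it neither reassembles
// multi-byte UTF-8 sequences nor validates malformed escapes.
public static String unescapeSketch(final String s) {
    final int len = s.length();
    final StringBuilder sbuf = new StringBuilder(len);
    for (int i = 0; i < len; i++) {
        final char ch = s.charAt(i);
        if (ch == '%' && i + 2 < len) {
            // decode the two hex digits following '%' into one character
            final int hb = Character.digit(s.charAt(++i), 16);
            final int lb = Character.digit(s.charAt(++i), 16);
            sbuf.append((char) ((hb << 4) | lb));
        } else if (ch == '+') {
            sbuf.append(' '); // '+' encodes a space in URL query strings
        } else {
            sbuf.append(ch);  // pass all other characters through unchanged
        }
    }
    return sbuf.toString();
}
// ---------------------------------------------------------------------------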
+// +//This program is distributed in the hope that it will be useful, +//but WITHOUT ANY WARRANTY; without even the implied warranty of +//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +//GNU General Public License for more details. +// +//You should have received a copy of the GNU General Public License +//along with this program; if not, write to the Free Software +//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.crawler; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.Map; + +public class RobotsEntry { + + public static final String ROBOTS_DB_PATH_SEPARATOR = ";"; + public static final String ALLOW_PATH_LIST = "allow"; + public static final String DISALLOW_PATH_LIST = "disallow"; + public static final String LOADED_DATE = "date"; + public static final String MOD_DATE = "modDate"; + public static final String ETAG = "etag"; + public static final String SITEMAP = "sitemap"; + public static final String CRAWL_DELAY = "crawlDelay"; + public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis"; + + // this is a simple record structure that holds all properties of a single crawl start + Map mem; + private LinkedList allowPathList, denyPathList; + String hostName; + + public RobotsEntry(final String hostName, final Map mem) { + this.hostName = hostName.toLowerCase(); + this.mem = mem; + + if (this.mem.containsKey(DISALLOW_PATH_LIST)) { + this.denyPathList = new LinkedList(); + final String csPl = this.mem.get(DISALLOW_PATH_LIST); + if (csPl.length() > 0){ + final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); + if ((pathArray != null)&&(pathArray.length > 0)) { + this.denyPathList.addAll(Arrays.asList(pathArray)); + } + } + } else { + this.denyPathList = new LinkedList(); + } + if (this.mem.containsKey(ALLOW_PATH_LIST)) { + this.allowPathList = new LinkedList(); + final String csPl = this.mem.get(ALLOW_PATH_LIST); + if (csPl.length() > 0){ + final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); + if ((pathArray != null)&&(pathArray.length > 0)) { + this.allowPathList.addAll(Arrays.asList(pathArray)); + } + } + } else { + this.allowPathList = new LinkedList(); + } + } + + public RobotsEntry( + final String hostName, + final ArrayList allowPathList, + final ArrayList disallowPathList, + final Date loadedDate, + final Date modDate, + final String eTag, + final String sitemap, + final long crawlDelayMillis + ) { + if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing"); + + this.hostName = hostName.trim().toLowerCase(); + this.allowPathList = new LinkedList(); + this.denyPathList = new LinkedList(); + + this.mem = new HashMap(5); + if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime())); + if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime())); + if (eTag != null) this.mem.put(ETAG,eTag); + if (sitemap != null) this.mem.put(SITEMAP,sitemap); + if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis)); + + if ((allowPathList != null)&&(allowPathList.size()>0)) { + this.allowPathList.addAll(allowPathList); + + final StringBuilder pathListStr = new StringBuilder(); + for (int i=0; i0)) { + this.denyPathList.addAll(disallowPathList); + + final StringBuilder pathListStr = new StringBuilder(); + for (int i=0; i pathIter = 
this.denyPathList.iterator(); + while (pathIter.hasNext()) { + final String nextPath = pathIter.next(); + + // disallow rule + if (path.startsWith(nextPath)) { + return true; + } + } + return false; + } + +} \ No newline at end of file diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 2c0d34cc6..fade83cfb 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -32,11 +32,7 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Date; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedList; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; @@ -109,12 +105,12 @@ public class RobotsTxt { return this.robotsTable.size(); } - private Entry getEntry(final String urlHostPort, final boolean fetchOnlineIfNotAvailableOrNotFresh) { + private RobotsEntry getEntry(final String urlHostPort, final boolean fetchOnlineIfNotAvailableOrNotFresh) { // this method will always return a non-null value - Entry robotsTxt4Host = null; + RobotsEntry robotsTxt4Host = null; try { final Map record = this.robotsTable.get(urlHostPort); - if (record != null) robotsTxt4Host = new Entry(urlHostPort, record); + if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record); } catch (final kelondroException e) { resetDatabase(); } catch (final IOException e) { @@ -143,7 +139,7 @@ public class RobotsTxt { // to complete a download try { final Map record = this.robotsTable.get(urlHostPort); - if (record != null) robotsTxt4Host = new Entry(urlHostPort, record); + if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record); } catch (final kelondroException e) { resetDatabase(); } catch (final IOException e) { @@ -185,7 +181,7 @@ public class RobotsTxt { // no robots.txt available, make an entry to prevent that the robots loading is done twice if (robotsTxt4Host == null) { // generate artificial entry - robotsTxt4Host = new Entry( + robotsTxt4Host = new RobotsEntry( urlHostPort, new ArrayList(), new ArrayList(), @@ -233,11 +229,11 @@ public class RobotsTxt { public long crawlDelayMillis(final yacyURL theURL) { final String urlHostPort = getHostPort(theURL); - final RobotsTxt.Entry robotsEntry = getEntry(urlHostPort, true); + final RobotsEntry robotsEntry = getEntry(urlHostPort, true); return robotsEntry.getCrawlDelayMillis(); } - private Entry addEntry( + private RobotsEntry addEntry( final String hostName, final ArrayList allowPathList, final ArrayList denyPathList, @@ -247,14 +243,14 @@ public class RobotsTxt { final String sitemap, final long crawlDelayMillis ) { - final Entry entry = new Entry( + final RobotsEntry entry = new RobotsEntry( hostName, allowPathList, denyPathList, loadedDate, modDate, eTag, sitemap, crawlDelayMillis); addEntry(entry); return entry; } - private String addEntry(final Entry entry) { + private String addEntry(final RobotsEntry entry) { // writes a new page and returns key try { this.robotsTable.put(entry.hostName, entry.mem); @@ -264,176 +260,6 @@ public class RobotsTxt { } } - public static class Entry { - public static final String ALLOW_PATH_LIST = "allow"; - public static final String DISALLOW_PATH_LIST = "disallow"; - public static final String LOADED_DATE = "date"; - public static final String MOD_DATE = "modDate"; - public static final String ETAG = "etag"; - public static final String SITEMAP = "sitemap"; - public static 
final String CRAWL_DELAY = "crawlDelay"; - public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis"; - - // this is a simple record structure that holds all properties of a single crawl start - Map mem; - private LinkedList allowPathList, denyPathList; - String hostName; - - public Entry(final String hostName, final Map mem) { - this.hostName = hostName.toLowerCase(); - this.mem = mem; - - if (this.mem.containsKey(DISALLOW_PATH_LIST)) { - this.denyPathList = new LinkedList(); - final String csPl = this.mem.get(DISALLOW_PATH_LIST); - if (csPl.length() > 0){ - final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); - if ((pathArray != null)&&(pathArray.length > 0)) { - this.denyPathList.addAll(Arrays.asList(pathArray)); - } - } - } else { - this.denyPathList = new LinkedList(); - } - if (this.mem.containsKey(ALLOW_PATH_LIST)) { - this.allowPathList = new LinkedList(); - final String csPl = this.mem.get(ALLOW_PATH_LIST); - if (csPl.length() > 0){ - final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); - if ((pathArray != null)&&(pathArray.length > 0)) { - this.allowPathList.addAll(Arrays.asList(pathArray)); - } - } - } else { - this.allowPathList = new LinkedList(); - } - } - - public Entry( - final String hostName, - final ArrayList allowPathList, - final ArrayList disallowPathList, - final Date loadedDate, - final Date modDate, - final String eTag, - final String sitemap, - final long crawlDelayMillis - ) { - if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing"); - - this.hostName = hostName.trim().toLowerCase(); - this.allowPathList = new LinkedList(); - this.denyPathList = new LinkedList(); - - this.mem = new HashMap(5); - if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime())); - if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime())); - if (eTag != null) this.mem.put(ETAG,eTag); - if (sitemap != null) this.mem.put(SITEMAP,sitemap); - if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis)); - - if ((allowPathList != null)&&(allowPathList.size()>0)) { - this.allowPathList.addAll(allowPathList); - - final StringBuilder pathListStr = new StringBuilder(); - for (int i=0; i0)) { - this.denyPathList.addAll(disallowPathList); - - final StringBuilder pathListStr = new StringBuilder(); - for (int i=0; i pathIter = this.denyPathList.iterator(); - while (pathIter.hasNext()) { - final String nextPath = pathIter.next(); - - // disallow rule - if (path.startsWith(nextPath)) { - return true; - } - } - return false; - } - - } - // methods that had been in robotsParser.java: public static final int DOWNLOAD_ACCESS_RESTRICTED = 0; @@ -469,7 +295,7 @@ public class RobotsTxt { // generating the hostname:poart string needed to do a DB lookup final String urlHostPort = getHostPort(theURL); - final RobotsTxt.Entry robotsTxt4Host = this.getEntry(urlHostPort, true); + final RobotsEntry robotsTxt4Host = this.getEntry(urlHostPort, true); try { final String sitemapUrlStr = robotsTxt4Host.getSitemap(); @@ -485,7 +311,7 @@ public class RobotsTxt { // generating the hostname:poart string needed to do a DB lookup final String urlHostPort = getHostPort(theURL); - final RobotsTxt.Entry robotsTxt4Host = getEntry(urlHostPort, true); + final RobotsEntry robotsTxt4Host = getEntry(urlHostPort, true); try { crawlDelay = robotsTxt4Host.getCrawlDelayMillis(); @@ -499,12 +325,12 @@ public class RobotsTxt { // generating the hostname:port string needed to 
do a DB lookup final String urlHostPort = getHostPort(nexturl); - RobotsTxt.Entry robotsTxt4Host = null; + RobotsEntry robotsTxt4Host = null; robotsTxt4Host = getEntry(urlHostPort, true); return robotsTxt4Host.isDisallowed(nexturl.getFile()); } - private static Object[] downloadRobotsTxt(final yacyURL robotsURL, int redirectionCount, final RobotsTxt.Entry entry) throws Exception { + private static Object[] downloadRobotsTxt(final yacyURL robotsURL, int redirectionCount, final RobotsEntry entry) throws Exception { if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null}; redirectionCount--; diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index d3ee175d6..60c130961 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -37,6 +37,7 @@ import de.anomic.document.Parser; import de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; import de.anomic.http.metadata.ResponseHeader; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.util.DateFormatter; import de.anomic.net.ftpc; import de.anomic.search.Switchboard; @@ -108,7 +109,7 @@ public class FTPLoader { if (file.length() == 0) { // directory -> get list of files RequestHeader requestHeader = new RequestHeader(); - if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(request.referrerhash()).toNormalform(true, false)); + if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false)); byte[] dirList = generateDirlist(ftpClient, request, path); @@ -242,7 +243,7 @@ public class FTPLoader { // create a cache entry RequestHeader requestHeader = new RequestHeader(); - if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(request.referrerhash()).toNormalform(true, false)); + if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false)); ResponseHeader responseHeader = new ResponseHeader(); responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate)); responseHeader.put(HeaderFramework.CONTENT_TYPE, mimeType); diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index d46d36824..f190bae27 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -35,6 +35,7 @@ import de.anomic.http.client.Client; import de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; import de.anomic.http.metadata.ResponseContainer; +import de.anomic.kelondro.text.Segments; import de.anomic.search.Switchboard; import de.anomic.yacy.yacyURL; import de.anomic.yacy.logging.Log; @@ -118,7 +119,7 @@ public final class HTTPLoader { final RequestHeader requestHeader = new RequestHeader(); requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent); yacyURL refererURL = null; - if (request.referrerhash() != null) refererURL = sb.getURL(request.referrerhash()); + if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()); if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true)); requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, 
sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE)); requestHeader.put(HeaderFramework.ACCEPT_CHARSET, sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET)); @@ -196,7 +197,7 @@ public final class HTTPLoader { final String urlhash = redirectionUrl.hash(); // check if the url was already indexed - final String dbname = sb.urlExists(urlhash); + final String dbname = sb.urlExists(Segments.Process.LOCALCRAWLING, urlhash); if (dbname != null) { sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection to double content"); throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname); diff --git a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java index fcee4500a..a40a2fabb 100644 --- a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java +++ b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java @@ -43,6 +43,7 @@ import de.anomic.http.client.Cache; import de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; import de.anomic.http.metadata.ResponseHeader; +import de.anomic.kelondro.text.Segments; import de.anomic.search.Switchboard; import de.anomic.server.serverCore; import de.anomic.yacy.yacyURL; @@ -167,7 +168,7 @@ public final class LoaderDispatcher { final RequestHeader requestHeader = new RequestHeader(); requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent); yacyURL refererURL = null; - if (request.referrerhash() != null) refererURL = sb.getURL(request.referrerhash()); + if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()); if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true)); Response response = new Response( request, diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index 9df7bd1d7..b3c3f03de 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -46,6 +46,7 @@ import de.anomic.http.io.ByteCountInputStream; import de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; import de.anomic.http.metadata.ResponseContainer; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.search.Switchboard; @@ -258,10 +259,10 @@ public class SitemapParser extends DefaultHandler { // check if the url is known and needs to be recrawled if (this.lastMod != null) { - final String dbocc = this.sb.urlExists(nexturlhash); + final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash); if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) { // the url was already loaded. 
we need to check the date - final URLMetadataRow oldEntry = this.sb.indexSegment.urlMetadata().load(nexturlhash, null, 0); + final URLMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash, null, 0); if (oldEntry != null) { final Date modDate = oldEntry.moddate(); // check if modDate is null diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java index 74f98a1b1..0b420e751 100644 --- a/source/de/anomic/data/URLAnalysis.java +++ b/source/de/anomic/data/URLAnalysis.java @@ -411,8 +411,8 @@ public class URLAnalysis { public static int diffurlcol(String metadataPath, String statisticFile, String diffFile) throws IOException { System.out.println("INDEX DIFF URL-COL startup"); - HandleMap idx = new HandleMap(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, 4, new File(statisticFile), 0); - MetadataRepository mr = new MetadataRepository(new File(metadataPath), false, false); + HandleMap idx = new HandleMap(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, 4, new File(statisticFile), 0); + MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false); HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, 0, 1000000); System.out.println("INDEX DIFF URL-COL loaded dump, starting diff"); long start = System.currentTimeMillis(); @@ -438,8 +438,8 @@ public class URLAnalysis { public static void export(String metadataPath, int format, String export, String diffFile) throws IOException { // format: 0=text, 1=html, 2=rss/xml - System.out.println("URL EXPORT startup"); - MetadataRepository mr = new MetadataRepository(new File(metadataPath), false, false); + System.out.println("URL EXPORT startup"); + MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false); HandleSet hs = (diffFile == null) ? 
null : new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(diffFile), 0); System.out.println("URL EXPORT loaded dump, starting export"); Export e = mr.export(new File(export), ".*", hs, format, false); @@ -452,8 +452,8 @@ public class URLAnalysis { } public static void delete(String metadataPath, String diffFile) throws IOException { - System.out.println("URL DELETE startup"); - MetadataRepository mr = new MetadataRepository(new File(metadataPath), false, false); + System.out.println("URL DELETE startup"); + MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false); int mrSize = mr.size(); HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(diffFile), 0); System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize); diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index b80b661d8..a89e8d434 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -70,6 +70,7 @@ import de.anomic.kelondro.blob.Heap; import de.anomic.kelondro.blob.MapView; import de.anomic.kelondro.order.CloneableIterator; import de.anomic.kelondro.order.NaturalOrder; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.kelondroException; import de.anomic.kelondro.util.FileUtils; @@ -255,7 +256,7 @@ public class bookmarksDB { Pattern.compile(newcrawlingMustMatch); String urlhash = crawlingStartURL.hash(); - sb.indexSegment.urlMetadata().remove(urlhash); + sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(urlhash); sb.crawlQueues.noticeURL.removeByURLHash(urlhash); sb.crawlQueues.errorURL.remove(urlhash); diff --git a/source/de/anomic/http/server/servlets/transferURL.java b/source/de/anomic/http/server/servlets/transferURL.java index ad6efce4f..2336d1685 100644 --- a/source/de/anomic/http/server/servlets/transferURL.java +++ b/source/de/anomic/http/server/servlets/transferURL.java @@ -1,6 +1,5 @@ // this is a temporary 1-to-1 copy of the transferURL servlet - package de.anomic.http.server.servlets; import java.io.IOException; @@ -10,6 +9,7 @@ import de.anomic.content.RSSMessage; import de.anomic.data.Blacklist; import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.metadata.RequestHeader; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; import de.anomic.search.Switchboard; @@ -20,32 +20,32 @@ import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyNetwork; import de.anomic.yacy.yacySeed; -public final class transferURL { - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + +public final class transferURL { + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) throws InterruptedException { final long start = System.currentTimeMillis(); long freshdate = 0; @@ -81,7 +81,7 @@ public final class transferURL { } else { int received = 0; int blocked = 0; - final int sizeBefore = sb.indexSegment.urlMetadata().size(); + final int sizeBefore = sb.indexSegments.urlMetadata(Segments.Process.DHTIN).size(); // read the urls from the other properties and store String urls; URLMetadataRow lEntry; @@ -139,7 +139,7 @@ public final class transferURL { // write entry to 
database yacyCore.log.logInfo("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false)); try { - sb.indexSegment.urlMetadata().store(lEntry); + sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry); sb.crawlResults.stack(lEntry, iam, iam, 3); if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName); received++; @@ -151,7 +151,7 @@ public final class transferURL { sb.peers.mySeed().incRU(received); // return rewrite properties - final int more = sb.indexSegment.urlMetadata().size() - sizeBefore; + final int more = sb.indexSegments.urlMetadata(Segments.Process.DHTIN).size() - sizeBefore; doublevalues = Integer.toString(received - more); yacyCore.log.logInfo("Received " + received + " URLs from peer " + otherPeerName + " in " + (System.currentTimeMillis() - start) + " ms, blocked " + blocked + " URLs"); RSSFeed.channels(RSSFeed.INDEXRECEIVE).addMessage(new RSSMessage("Received " + received + " URLs from peer " + otherPeerName + ", blocked " + blocked, "", "")); @@ -163,4 +163,4 @@ public final class transferURL { prop.put("result", result); return prop; } -} +} \ No newline at end of file diff --git a/source/de/anomic/kelondro/blob/ArrayStack.java b/source/de/anomic/kelondro/blob/ArrayStack.java index ae39423ac..68cd5c372 100755 --- a/source/de/anomic/kelondro/blob/ArrayStack.java +++ b/source/de/anomic/kelondro/blob/ArrayStack.java @@ -450,6 +450,7 @@ public class ArrayStack implements BLOB { * @return the number of entries in each blob */ public synchronized int[] sizes() { + if (blobs == null) return new int[0]; int[] s = new int[blobs.size()]; int c = 0; for (blobItem bi: blobs) s[c++] = bi.blob.size(); diff --git a/source/de/anomic/kelondro/blob/Compressor.java b/source/de/anomic/kelondro/blob/Compressor.java index 2d7edf4da..ff46a6234 100644 --- a/source/de/anomic/kelondro/blob/Compressor.java +++ b/source/de/anomic/kelondro/blob/Compressor.java @@ -126,7 +126,7 @@ public class Compressor implements BLOB { private byte[] decompress(byte[] b) { // use a magic in the head of the bytes to identify compression type if (b == null) return null; - if (ByteArray.equals(b, gzipMagic)) { + if (ByteArray.startsWith(b, gzipMagic)) { //System.out.print("\\"); // DEBUG cdr--; ByteArrayInputStream bais = new ByteArrayInputStream(b); @@ -150,7 +150,7 @@ public class Compressor implements BLOB { e.printStackTrace(); return null; } - } else if (ByteArray.equals(b, plainMagic)) { + } else if (ByteArray.startsWith(b, plainMagic)) { //System.out.print("-"); // DEBUG byte[] r = new byte[b.length - 2]; System.arraycopy(b, 2, r, 0, b.length - 2); diff --git a/source/de/anomic/kelondro/index/Column.java b/source/de/anomic/kelondro/index/Column.java index 46d743cde..3606fdc84 100644 --- a/source/de/anomic/kelondro/index/Column.java +++ b/source/de/anomic/kelondro/index/Column.java @@ -173,7 +173,7 @@ public class Column { else if (this.celltype == celltype_boolean) this.encoder = encoder_bytes; else if (this.celltype == celltype_binary) this.encoder = encoder_bytes; else if (this.celltype == celltype_string) this.encoder = encoder_bytes; - else throw new kelondroException("kelondroColumn - encoder missing for cell " + this.nickname); + else throw new kelondroException("kelondroColumn - encoder missing for cell '" + this.nickname + "'"); } } else { if (this.celltype == celltype_cardinal) throw new kelondroException("kelondroColumn - 
encoder missing for cell " + this.nickname); diff --git a/source/de/anomic/kelondro/index/Row.java b/source/de/anomic/kelondro/index/Row.java index 83b0eca87..107bf1fdc 100644 --- a/source/de/anomic/kelondro/index/Row.java +++ b/source/de/anomic/kelondro/index/Row.java @@ -380,11 +380,11 @@ public final class Row { final Object[] ref = nickref.get(nickname); if (ref == null) return; final Column col = (Column) ref[0]; - setCol(col.encoder, ((Integer) ref[1]).intValue(), col.cellwidth, cell); + setCol(((Integer) ref[1]).intValue(), col.cellwidth, cell); } public final void setCol(final int column, final byte[] cell) { - setCol(row[column].encoder, colstart[column], row[column].cellwidth, cell); + setCol(colstart[column], row[column].cellwidth, cell); } public final void setCol(final int column, final char[] cell) { @@ -393,7 +393,7 @@ public final class Row { for (int i = cell.length; i < row[column].cellwidth; i++) rowinstance[offset + clstrt + i] = 0; } - private final void setCol(final int encoding, final int clstrt, int length, final byte[] cell) { + private final void setCol(final int clstrt, int length, final byte[] cell) { if (cell == null) { while (length-- > 0) rowinstance[offset + clstrt + length] = 0; } else { @@ -411,6 +411,10 @@ public final class Row { rowinstance[offset + colstart[column]] = c; } + public final void setCol(final int column, final String cell) { + setCol(column, cell.getBytes()); + } + public final void setCol(final int column, final String cell, final String encoding) { if (encoding == null) setCol(column, cell.getBytes()); diff --git a/source/de/anomic/kelondro/text/DocumentIndex.java b/source/de/anomic/kelondro/text/DocumentIndex.java index 7b1c58666..2296d2826 100644 --- a/source/de/anomic/kelondro/text/DocumentIndex.java +++ b/source/de/anomic/kelondro/text/DocumentIndex.java @@ -122,7 +122,7 @@ public class DocumentIndex extends Segment { * If the given file is a path to a directory, the complete sub-tree is indexed * @param start */ - public void addAll(File start) { + public void addConcurrent(File start) { assert (start != null); assert (start.canRead()) : start.toString(); if (!start.isDirectory()) { @@ -137,7 +137,7 @@ public class DocumentIndex extends Segment { w = new File(start, t); if (w.canRead() && !w.isHidden()) { if (w.isDirectory()) { - addAll(w); + addConcurrent(w); } else { try { this.queue.put(w); @@ -232,7 +232,7 @@ public class DocumentIndex extends Segment { if (args[1].equals("add")) { File f = new File(args[2]); DocumentIndex di = new DocumentIndex(segmentPath); - di.addAll(f); + di.addConcurrent(f); di.close(); } else { String query = ""; diff --git a/source/de/anomic/kelondro/text/IndexCell.java b/source/de/anomic/kelondro/text/IndexCell.java index 4b56e7773..81f42466d 100644 --- a/source/de/anomic/kelondro/text/IndexCell.java +++ b/source/de/anomic/kelondro/text/IndexCell.java @@ -31,6 +31,7 @@ import java.io.IOException; import java.util.HashSet; import java.util.Set; +import de.anomic.kelondro.index.ARC; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.SimpleARC; import de.anomic.kelondro.order.ByteOrder; @@ -64,11 +65,12 @@ public final class IndexCell extends AbstractBu private long lastCleanup; private final long targetFileSize, maxFileSize; private final int writeBufferSize; - private final SimpleARC countCache; + private final ARC countCache; private boolean cleanerRunning = false; public IndexCell( final File cellPath, + final String prefix, final ReferenceFactory factory, final ByteOrder termOrder, 
final Row payloadrow, @@ -80,7 +82,7 @@ public final class IndexCell extends AbstractBu ) throws IOException { super(factory); - this.array = new ReferenceContainerArray(cellPath, factory, termOrder, payloadrow, merger); + this.array = new ReferenceContainerArray(cellPath, prefix, factory, termOrder, payloadrow, merger); this.ram = new ReferenceContainerCache(factory, payloadrow, termOrder); this.maxRamEntries = maxRamEntries; this.merger = merger; diff --git a/source/de/anomic/kelondro/text/ReferenceContainerArray.java b/source/de/anomic/kelondro/text/ReferenceContainerArray.java index a3c255d95..0c9be8394 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainerArray.java +++ b/source/de/anomic/kelondro/text/ReferenceContainerArray.java @@ -57,6 +57,7 @@ public final class ReferenceContainerArray { */ public ReferenceContainerArray( final File heapLocation, + final String prefix, final ReferenceFactory factory, final ByteOrder termOrder, final Row payloadrow, @@ -65,7 +66,7 @@ public final class ReferenceContainerArray { this.payloadrow = payloadrow; this.array = new ArrayStack( heapLocation, - "index", + prefix, payloadrow.primaryKeyLength, termOrder, 0); @@ -217,6 +218,18 @@ public final class ReferenceContainerArray { return c; } + /** + * calculate an upper limit for a ranking number of the container size + * the returned number is not a counter. It can only be used to compare the + * ReferenceContainer, that may be produced as a result of get() + * @param termHash + * @return a ranking number + * @throws IOException + */ + public long lenghtRankingUpperLimit(final byte[] termHash) throws IOException { + return this.array.lengthAdd(termHash); + } + /** * delete a indexContainer from the heap cache. This can only be used for write-enabled heaps * @param wordHash diff --git a/source/de/anomic/kelondro/text/Segment.java b/source/de/anomic/kelondro/text/Segment.java index 063245d76..79f211718 100644 --- a/source/de/anomic/kelondro/text/Segment.java +++ b/source/de/anomic/kelondro/text/Segment.java @@ -82,6 +82,9 @@ public class Segment { final boolean useTailCache, final boolean exceed134217727) throws IOException { + migrateTextIndex(segmentPath, segmentPath); + migrateTextMetadata(segmentPath, segmentPath); + log.logInfo("Initializing Segment '" + segmentPath + "', word hash cache size is " + Word.hashCacheSize + "."); this.log = log; @@ -89,8 +92,10 @@ public class Segment { this.merger = new IODispatcher(1, 1, writeBufferSize); this.merger.start(); + this.termIndex = new IndexCell( - new File(segmentPath, "RICELL"), + segmentPath, + "text.index", wordReferenceFactory, wordOrder, WordReferenceRow.urlEntryRow, @@ -111,11 +116,35 @@ public class Segment { this.merger, writeBufferSize); */ - File metadatadir = new File(segmentPath, "METADATA"); - if (!metadatadir.exists()) metadatadir.mkdirs(); - + // create LURL-db - urlMetadata = new MetadataRepository(metadatadir, useTailCache, exceed134217727); + urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727); + } + + public static void migrateTextIndex(File oldSegmentPath, File newSegmentPath) { + File oldCellPath = new File(oldSegmentPath, "RICELL"); + if (!oldCellPath.exists()) return; + String[] oldIndexFiles = oldCellPath.list(); + for (String oldIndexFile: oldIndexFiles) { + if (oldIndexFile.startsWith("index.")) { + File newFile = new File(newSegmentPath, "text.index." 
+ oldIndexFile.substring(6)); + new File(oldCellPath, oldIndexFile).renameTo(newFile); + } + } + oldCellPath.delete(); + } + + public static void migrateTextMetadata(File oldSegmentPath, File newSegmentPath) { + File oldMetadataPath = new File(oldSegmentPath, "METADATA"); + if (!oldMetadataPath.exists()) return; + String[] oldMetadataFiles = oldMetadataPath.list(); + for (String oldMetadataFile: oldMetadataFiles) { + if (oldMetadataFile.startsWith("urls.")) { + File newFile = new File(newSegmentPath, "text.urlmd." + oldMetadataFile.substring(5)); + new File(oldMetadataPath, oldMetadataFile).renameTo(newFile); + } + } + oldMetadataPath.delete(); } public MetadataRepository urlMetadata() { @@ -431,5 +460,13 @@ public class Segment { } } } + + public int rwisize() { + return termIndex().sizesMax(); + } + + public int urlsize() { + return urlMetadata().size(); + } } } diff --git a/source/de/anomic/kelondro/text/Segments.java b/source/de/anomic/kelondro/text/Segments.java new file mode 100644 index 000000000..fda6f4432 --- /dev/null +++ b/source/de/anomic/kelondro/text/Segments.java @@ -0,0 +1,226 @@ +// Segments.java +// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 30.07.2009 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2009-05-28 01:51:34 +0200 (Do, 28 Mai 2009) $ +// $LastChangedRevision: 5988 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
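// ---------------------------------------------------------------------------
// Note on the Segments class defined below: it keeps one Segment instance per
// segment name and resolves the Process constants through the process
// assignment map, creating a segment directory lazily on first access. A
// minimal usage sketch follows; it is an illustration only (the class name
// SegmentsUsageSketch and the cache/file-size figures are placeholders), not
// part of the committed patch.
import java.io.File;
import java.io.IOException;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.yacy.logging.Log;

final class SegmentsUsageSketch {
    static Segment publicSegment(final Log log, final File segmentsPath) throws IOException {
        // open the segment container (placeholder cache size and maximum file size)
        final Segments segments = new Segments(log, segmentsPath, 100000, 1024L * 1024L * 1024L, false, false);
        // route local crawling into its own segment; all other processes keep "default"
        segments.setSegment(Segments.Process.LOCALCRAWLING, "crawl");
        // PUBLIC is the segment that the search servlets and the p2p API read from
        return segments.segment(Segments.Process.PUBLIC);
    }
}
// ---------------------------------------------------------------------------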
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.kelondro.text; + +import java.io.File; +import java.io.IOException; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; + +import de.anomic.document.Condenser; +import de.anomic.document.Document; +import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; +import de.anomic.kelondro.text.referencePrototype.WordReference; +import de.anomic.yacy.yacyURL; +import de.anomic.yacy.logging.Log; + +public final class Segments implements Iterable { + + /** + * process enumeration type + * defines constants that can be used to assign process-related segment names + */ + public enum Process { + + RECEIPTS, + QUERIES, + DHTIN, + DHTOUT, // the only segment that is used for reading-only + PROXY, + LOCALCRAWLING, + REMOTECRAWLING, + PUBLIC; // includes the index that can be retrieved by the yacy p2p api + + public String toString() { + throw new UnsupportedOperationException("toString not allowed"); + } + } + + private final Log log; + private final File segmentsPath; + private final int entityCacheMaxSize; + private final long maxFileSize; + private HashMap segments; + private HashMap process_assignment; + private final boolean useTailCache; + private final boolean exceed134217727; + + public Segments( + final Log log, + final File segmentsPath, + final int entityCacheMaxSize, + final long maxFileSize, + final boolean useTailCache, + final boolean exceed134217727) throws IOException { + this.log = log; + this.segmentsPath = segmentsPath; + this.entityCacheMaxSize = entityCacheMaxSize; + this.maxFileSize = maxFileSize; + this.useTailCache = useTailCache; + this.exceed134217727 = exceed134217727; + this.segments = new HashMap(); + this.process_assignment = new HashMap(); + + // assign default segment names for the processes + this.process_assignment.put(Process.RECEIPTS, "default"); + this.process_assignment.put(Process.QUERIES, "default"); + this.process_assignment.put(Process.DHTIN, "default"); + this.process_assignment.put(Process.DHTOUT, "default"); + this.process_assignment.put(Process.PROXY, "default"); + this.process_assignment.put(Process.LOCALCRAWLING, "default"); + this.process_assignment.put(Process.REMOTECRAWLING, "default"); + this.process_assignment.put(Process.PUBLIC, "default"); + } + + public void setSegment(Process process, String segmentName) { + this.process_assignment.put(process, segmentName); + } + + public static void migrateOld(File oldSingleSegment, File newSegmentsPath, String newSegmentName) { + if (!oldSingleSegment.exists()) return; + File newSegmentPath = new File(newSegmentsPath, newSegmentName); + if (!newSegmentPath.exists()) newSegmentPath.mkdirs(); + Segment.migrateTextIndex(oldSingleSegment, newSegmentPath); + Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath); + + String[] oldFiles = oldSingleSegment.list(); + for (String oldFile: oldFiles) { + if (oldFile.startsWith("text.")) { + new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile)); + } + } + } + + public String[] segmentNames() { + return this.segments.keySet().toArray(new String[this.segments.size()]); + } + + public boolean segmentExist(final String segmentName) { + return segments.containsKey(segmentName); + } + + public Segment segment(final Process process) { + return 
segment(this.process_assignment.get(process)); + } + + public Segment segment(final String segmentName) { + Segment segment = segments.get(segmentName); + if (segment == null) { + // generate the segment + try { + segment = new Segment( + this.log, + new File(this.segmentsPath, segmentName), + this.entityCacheMaxSize, + this.maxFileSize, + this.useTailCache, + this.exceed134217727); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + this.segments.put(segmentName, segment); + } + return segment; + } + + public int URLCount() { + int c = 0; + for (Segment s: this.segments.values()) c += s.urlMetadata().size(); + return c; + } + + public int RWICount() { + int c = 0; + for (Segment s: this.segments.values()) c += s.termIndex().sizesMax(); + return c; + } + + public int RWIBufferCount() { + int c = 0; + for (Segment s: this.segments.values()) c += s.termIndex().getBufferSize(); + return c; + } + + public MetadataRepository urlMetadata(final Process process) { + return segment(this.process_assignment.get(process)).urlMetadata(); + } + + public IndexCell termIndex(final Process process) { + return segment(this.process_assignment.get(process)).termIndex(); + } + + public void clear(final Process process) { + segment(this.process_assignment.get(process)).clear(); + } + + public File getLocation(final Process process) { + return segment(this.process_assignment.get(process)).getLocation(); + } + + public void close(final Process process) { + segment(this.process_assignment.get(process)).close(); + } + + public void close() { + if (segments != null) for (Segment s: this.segments.values()) s.close(); + this.segments = null; + } + + public void finalize() { + this.close(); + } + + public URLMetadataRow storeDocument( + final String segmentName, + final yacyURL url, + final yacyURL referrerURL, + final Date docDate, + final long sourcesize, + final Document document, + final Condenser condenser + ) throws IOException { + return segment(segmentName).storeDocument( + url, + referrerURL, + docDate, + sourcesize, + document, + condenser + ); + } + + public synchronized Segment.ReferenceCleaner getReferenceCleaner(final String segmentName, final byte[] startHash) throws IOException { + return segment(segmentName).getReferenceCleaner(startHash); + } + + public Iterator iterator() { + return this.segments.values().iterator(); + } +} + diff --git a/source/de/anomic/kelondro/text/metadataPrototype/URLMetadataRow.java b/source/de/anomic/kelondro/text/metadataPrototype/URLMetadataRow.java index fadb168b1..8fed0dcd4 100644 --- a/source/de/anomic/kelondro/text/metadataPrototype/URLMetadataRow.java +++ b/source/de/anomic/kelondro/text/metadataPrototype/URLMetadataRow.java @@ -156,7 +156,7 @@ public class URLMetadataRow implements Metadata { final int lapp) { // create new entry this.entry = rowdef.newEntry(); - this.entry.setCol(col_hash, url.hash(), null); + this.entry.setCol(col_hash, url.hash()); this.entry.setCol(col_comp, encodeComp(url, dc_title, dc_creator, dc_subject, ETag)); encodeDate(col_mod, mod); encodeDate(col_load, load); @@ -191,11 +191,17 @@ public class URLMetadataRow implements Metadata { private void encodeDate(final int col, final Date d) { // calculates the number of days since 1.1.1970 and returns this as 4-byte array - this.entry.setCol(col, NaturalOrder.encodeLong(d.getTime() / 86400000, 4)); + // 86400000 is the number of milliseconds in one day + this.entry.setCol(col, NaturalOrder.encodeLong(d.getTime() / 86400000L, 4)); } private Date decodeDate(final int col) { - 
return new Date(86400000 * this.entry.getColLong(col)); + long t = this.entry.getColLong(col); + /*if (t < 14600) */return new Date(86400000L * t); // time was stored as number of days since epoch + /* + if (t < 350400) return new Date(3600000L * t); // hours since epoch + if (t < 21024000) return new Date(60000L * t); // minutes since epoch + */ } public static byte[] encodeComp(final yacyURL url, final String dc_title, final String dc_creator, final String dc_subject, final String ETag) { diff --git a/source/de/anomic/kelondro/util/ByteArray.java b/source/de/anomic/kelondro/util/ByteArray.java index 104802e0a..9e4f5f6a7 100644 --- a/source/de/anomic/kelondro/util/ByteArray.java +++ b/source/de/anomic/kelondro/util/ByteArray.java @@ -60,7 +60,7 @@ public class ByteArray { return buffer[pos]; } - public static boolean equals(final byte[] buffer, final byte[] pattern) { + public static boolean startsWith(final byte[] buffer, final byte[] pattern) { // compares two byte arrays: true, if pattern appears completely at offset position if (buffer == null && pattern == null) return true; if (buffer == null || pattern == null) return false; diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java index 5bbab43c4..8612c68e6 100644 --- a/source/de/anomic/search/ResultFetcher.java +++ b/source/de/anomic/search/ResultFetcher.java @@ -226,7 +226,7 @@ public class ResultFetcher { registerFailure(page.hash(), "no text snippet for URL " + metadata.url()); if (!peers.mySeed().isVirgin()) try { - TextSnippet.failConsequences(snippet, query.id(false)); + TextSnippet.failConsequences(this.indexSegment, snippet, query.id(false)); } catch (IOException e) { e.printStackTrace(); } diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index ca8c07882..d46ab5f6a 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -160,6 +160,7 @@ import de.anomic.http.server.RobotsTxtConfig; import de.anomic.kelondro.order.Digest; import de.anomic.kelondro.order.NaturalOrder; import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.Segments; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.util.DateFormatter; @@ -231,7 +232,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi public File surrogatesInPath; public File surrogatesOutPath; public Map rankingPermissions; - public Segment indexSegment; + public Segments indexSegments; public LoaderDispatcher loader; public CrawlSwitchboard crawler; public CrawlQueues crawlQueues; @@ -363,9 +364,12 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi partitionExponent, this.useTailCache, this.exceed134217727); - indexSegment = new Segment( + File oldSingleSegment = new File(new File(indexPath, networkName), "TEXT"); + File newSegmentsPath = new File(new File(indexPath, networkName), "SEGMENTS"); + Segments.migrateOld(oldSingleSegment, newSegmentsPath, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default")); + indexSegments = new Segments( log, - new File(new File(indexPath, networkName), "TEXT"), + newSegmentsPath, wordCacheMaxCount, fileSizeMax, this.useTailCache, @@ -377,8 +381,20 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi this.queuesRoot); } catch (IOException e1) { e1.printStackTrace(); - indexSegment = null; + indexSegments = null; } + + // 
set the default segment names + indexSegments.setSegment(Segments.Process.RECEIPTS, getConfig(SwitchboardConstants.SEGMENT_RECEIPTS, "default")); + indexSegments.setSegment(Segments.Process.QUERIES, getConfig(SwitchboardConstants.SEGMENT_QUERIES, "default")); + indexSegments.setSegment(Segments.Process.DHTIN, getConfig(SwitchboardConstants.SEGMENT_DHTIN, "default")); + indexSegments.setSegment(Segments.Process.DHTOUT, getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default")); + indexSegments.setSegment(Segments.Process.PROXY, getConfig(SwitchboardConstants.SEGMENT_PROXY, "default")); + indexSegments.setSegment(Segments.Process.LOCALCRAWLING, getConfig(SwitchboardConstants.SEGMENT_LOCALCRAWLING, "default")); + indexSegments.setSegment(Segments.Process.REMOTECRAWLING, getConfig(SwitchboardConstants.SEGMENT_REMOTECRAWLING, "default")); + indexSegments.setSegment(Segments.Process.PUBLIC, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default")); + + // init crawl results monitor cache crawlResults = new ResultURLs(); // start yacy core @@ -389,8 +405,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi // init a DHT transmission dispatcher this.dhtDispatcher = new Dispatcher( - indexSegment.termIndex(), - indexSegment.urlMetadata(), + indexSegments.segment(Segments.Process.LOCALCRAWLING), peers, true, 30000); @@ -583,7 +598,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi this.crawlStacker = new CrawlStacker( this.crawlQueues, this.crawler, - this.indexSegment, + this.indexSegments.segment(Segments.Process.LOCALCRAWLING), this.peers, "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0, "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0); @@ -794,8 +809,8 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi // switch the networks synchronized (this) { // shut down - synchronized (this.indexSegment) { - this.indexSegment.close(); + synchronized (this.indexSegments) { + this.indexSegments.close(); } this.crawlStacker.announceClose(); this.crawlStacker.close(); @@ -832,9 +847,9 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi this.useTailCache, this.exceed134217727); try { - indexSegment = new Segment( + indexSegments = new Segments( log, - new File(new File(indexPrimaryPath, networkName), "TEXT"), + new File(new File(indexPrimaryPath, networkName), "SEGMENTS"), wordCacheMaxCount, fileSizeMax, this.useTailCache, @@ -868,11 +883,10 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()), this.getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta())); - // we need a new stacker, because this uses network-specific attributes to sort out urls (local, global) this.crawlStacker = new CrawlStacker( this.crawlQueues, this.crawler, - this.indexSegment, + this.indexSegments.segment(Segments.Process.LOCALCRAWLING), this.peers, "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0, "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0); @@ -1007,26 +1021,32 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi } } - public String urlExists(final String hash) { + public String urlExists(Segments.Process process, final String hash) { // tests if hash occurs in any database // if it exists, the name of the database is returned, // if
it does not exist, null is returned - if (indexSegment.urlMetadata().exists(hash)) return "loaded"; + if (indexSegments.urlMetadata(process).exists(hash)) return "loaded"; return this.crawlQueues.urlExists(hash); } - public void urlRemove(final String hash) { - indexSegment.urlMetadata().remove(hash); + public void urlRemove(Segment segment, final String hash) { + segment.urlMetadata().remove(hash); crawlResults.remove(hash); crawlQueues.urlRemove(hash); } - public yacyURL getURL(final String urlhash) { + public void urlRemove(Segments.Process process, final String hash) { + indexSegments.urlMetadata(process).remove(hash); + crawlResults.remove(hash); + crawlQueues.urlRemove(hash); + } + + public yacyURL getURL(Segments.Process process, final String urlhash) { if (urlhash == null) return null; if (urlhash.length() == 0) return null; final yacyURL ne = crawlQueues.getURL(urlhash); if (ne != null) return ne; - final URLMetadataRow le = indexSegment.urlMetadata().load(urlhash, null, 0); + final URLMetadataRow le = indexSegments.urlMetadata(process).load(urlhash, null, 0); if (le != null) return le.metadata().url(); return null; } @@ -1120,7 +1140,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi crawlQueues.close(); crawler.close(); log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)"); - indexSegment.close(); + indexSegments.close(); peers.close(); Cache.close(); UPnP.deletePortMapping(); @@ -1187,7 +1207,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi // put document into the concurrent processing queue if (log.isFinest()) log.logFinest("deQueue: passing to indexing queue: " + response.url().toNormalform(true, false)); try { - this.indexingDocumentProcessor.enQueue(new indexingQueueEntry(response, null, null)); + this.indexingDocumentProcessor.enQueue(new indexingQueueEntry(Segments.Process.LOCALCRAWLING, response, null, null)); return null; } catch (InterruptedException e) { e.printStackTrace(); @@ -1232,7 +1252,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi 0 ); response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile); - indexingQueueEntry queueEntry = new indexingQueueEntry(response, document, null); + indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.LOCALCRAWLING, response, document, null); // place the queue entry into the concurrent process of the condenser (document analysis) try { @@ -1300,14 +1320,17 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi } public static class indexingQueueEntry extends serverProcessorJob { + public Segments.Process process; public Response queueEntry; public Document document; public Condenser condenser; public indexingQueueEntry( + final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser) { super(); + this.process = process; this.queueEntry = queueEntry; this.document = document; this.condenser = condenser; @@ -1330,7 +1353,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi // clear caches if necessary if (!MemoryControl.request(8000000L, false)) { - indexSegment.urlMetadata().clearCache(); + for (Segment indexSegment: this.indexSegments) indexSegment.urlMetadata().clearCache(); SearchEventCache.cleanupEvents(true); } @@ -1569,7 +1592,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
if (document == null) { return null; } - return new indexingQueueEntry(in.queueEntry, document, null); + return new indexingQueueEntry(in.process, in.queueEntry, document, null); } private Document parseDocument(Response entry) throws InterruptedException { @@ -1679,7 +1702,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi final CrawlProfile.entry profile = in.queueEntry.profile(); ResultImages.registerImages(in.document, (profile == null) ? true : !profile.remoteIndexing()); - return new indexingQueueEntry(in.queueEntry, in.document, condenser); + return new indexingQueueEntry(in.process, in.queueEntry, in.document, condenser); } catch (final UnsupportedEncodingException e) { return null; } @@ -1693,11 +1716,11 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi public void storeDocumentIndex(final indexingQueueEntry in) { in.queueEntry.updateStatus(Response.QUEUE_STATE_INDEXSTORAGE); - storeDocumentIndex(in.queueEntry, in.document, in.condenser); + storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser); in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED); } - private void storeDocumentIndex(final Response queueEntry, final Document document, final Condenser condenser) { + private void storeDocumentIndex(Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser) { // CREATE INDEX final String dc_title = document.dc_title(); @@ -1710,7 +1733,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi // STORE URL TO LOADED-URL-DB URLMetadataRow newEntry = null; try { - newEntry = indexSegment.storeDocument( + newEntry = indexSegments.segment(process).storeDocument( queueEntry.url(), referrerURL, queueEntry.lastModified(), @@ -1726,10 +1749,10 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi // update url result list statistics crawlResults.stack( - newEntry, // loaded url db entry - queueEntry.initiator(), // initiator peer hash + newEntry, // loaded url db entry + queueEntry.initiator(), // initiator peer hash this.peers.mySeed().hash, // executor peer hash - processCase // process case + processCase // process case ); // STORE WORD INDEX @@ -1801,11 +1824,11 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi } // method for index deletion - public int removeAllUrlReferences(final yacyURL url, final boolean fetchOnline) { - return removeAllUrlReferences(url.hash(), fetchOnline); + public int removeAllUrlReferences(Segment indexSegment, final yacyURL url, final boolean fetchOnline) { + return removeAllUrlReferences(indexSegment, url.hash(), fetchOnline); } - public int removeAllUrlReferences(final String urlhash, final boolean fetchOnline) { + public int removeAllUrlReferences(Segment indexSegment, final String urlhash, final boolean fetchOnline) { // find all the words in a specific resource and remove the url reference from every word index // finally, delete the url entry @@ -1937,7 +1960,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi return accessSet.tailSet(Long.valueOf(System.currentTimeMillis() - timeInterval)).size(); } - public String dhtShallTransfer() { + public String dhtShallTransfer(String segment) { String cautionCause = onlineCaution(); if (cautionCause != null) { return "online caution for " + cautionCause + ", dht transmission"; @@ -1960,6 +1983,7 @@ public final class Switchboard extends 
serverAbstractSwitch implements serverSwi if (getConfig(SwitchboardConstants.INDEX_DIST_ALLOW, "false").equalsIgnoreCase("false")) { return "no DHT distribution: not enabled (per setting)"; } + Segment indexSegment = this.indexSegments.segment(segment); if (indexSegment.urlMetadata().size() < 10) { return "no DHT distribution: loadedURL.size() = " + indexSegment.urlMetadata().size(); } @@ -1974,9 +1998,13 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi } return null; // this means; yes, please do dht transfer } - + public boolean dhtTransferJob() { - final String rejectReason = dhtShallTransfer(); + return dhtTransferJob(getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default")); + } + + public boolean dhtTransferJob(String segment) { + final String rejectReason = dhtShallTransfer(segment); if (rejectReason != null) { if (this.log.isFine()) log.logFine(rejectReason); return false; @@ -2073,10 +2101,10 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi peers.mySeed().put(yacySeed.RSPEED, Double.toString(totalQPM /*Math.max((float) requestcdiff, 0f) * 60f / Math.max((float) uptimediff, 1f)*/ )); peers.mySeed().put(yacySeed.UPTIME, Long.toString(uptime/60)); // the number of minutes that the peer is up in minutes/day (moving average MA30) - peers.mySeed().put(yacySeed.LCOUNT, Integer.toString(indexSegment.urlMetadata().size())); // the number of links that the peer has stored (LURL's) + peers.mySeed().put(yacySeed.LCOUNT, Integer.toString(indexSegments.URLCount())); // the number of links that the peer has stored (LURL's) peers.mySeed().put(yacySeed.NCOUNT, Integer.toString(crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's) peers.mySeed().put(yacySeed.RCOUNT, Integer.toString(crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's) - peers.mySeed().put(yacySeed.ICOUNT, Integer.toString(indexSegment.termIndex().sizesMax())); // the minimum number of words that the peer has indexed (as it says) + peers.mySeed().put(yacySeed.ICOUNT, Integer.toString(indexSegments.RWICount())); // the minimum number of words that the peer has indexed (as it says) peers.mySeed().put(yacySeed.SCOUNT, Integer.toString(peers.sizeConnected())); // the number of seeds that the peer has stored peers.mySeed().put(yacySeed.CCOUNT, Double.toString(((int) ((peers.sizeConnected() + peers.sizeDisconnected() + peers.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour) peers.mySeed().put(yacySeed.VERSION, yacyBuildProperties.getLongVersion()); diff --git a/source/de/anomic/search/SwitchboardConstants.java b/source/de/anomic/search/SwitchboardConstants.java index ddce48dcc..5d84e4e1d 100644 --- a/source/de/anomic/search/SwitchboardConstants.java +++ b/source/de/anomic/search/SwitchboardConstants.java @@ -437,4 +437,16 @@ public final class SwitchboardConstants { public static final String TRAY_ICON_FORCED = "trayIcon.force"; public static final String TRAY_LABEL = "tray.label"; public static final String BROWSERINTEGRATION = "browserintegration"; + + /** + * Segments + */ + public static final String SEGMENT_RECEIPTS = "segment.process.receipts_tmp"; + public static final String SEGMENT_QUERIES = "segment.process.queries_tmp"; + public static final String SEGMENT_DHTIN = "segment.process.dhtin_tmp"; + public static final String SEGMENT_DHTOUT = 
"segment.process.dhtout_tmp"; + public static final String SEGMENT_PROXY = "segment.process.proxy_tmp"; + public static final String SEGMENT_LOCALCRAWLING = "segment.process.localcrawling_tmp"; + public static final String SEGMENT_REMOTECRAWLING= "segment.process.remotecrawling_tmp"; + public static final String SEGMENT_PUBLIC = "segment.process.public_tmp"; } diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index 57fd409cf..8718d39a0 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -46,6 +46,7 @@ import de.anomic.http.metadata.ResponseHeader; import de.anomic.kelondro.index.ARC; import de.anomic.kelondro.index.ConcurrentARC; import de.anomic.kelondro.order.Base64Order; +import de.anomic.kelondro.text.Segment; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.SetTools; import de.anomic.yacy.yacySearch; @@ -571,7 +572,7 @@ public class TextSnippet { } } - public static String failConsequences(final TextSnippet snippet, final String eventID) throws IOException { + public static String failConsequences(Segment indexSegment, final TextSnippet snippet, final String eventID) throws IOException { // problems with snippet fetch final String urlHash = snippet.getUrl().hash(); final String querystring = SetTools.setToString(snippet.getRemainingHashes(), ' '); @@ -580,18 +581,17 @@ public class TextSnippet { (snippet.getErrorCode() == ERROR_PARSER_FAILED) || (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) { log.logInfo("error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError()); - Switchboard.getSwitchboard().indexSegment.urlMetadata().remove(urlHash); + indexSegment.urlMetadata().remove(urlHash); final SearchEvent event = SearchEventCache.getEvent(eventID); - assert Switchboard.getSwitchboard() != null; - assert Switchboard.getSwitchboard().indexSegment != null; + assert indexSegment != null; assert event != null : "eventID = " + eventID; assert event.getQuery() != null; - Switchboard.getSwitchboard().indexSegment.termIndex().remove(event.getQuery().queryHashes, urlHash); + indexSegment.termIndex().remove(event.getQuery().queryHashes, urlHash); event.remove(urlHash); } if (snippet.getErrorCode() == ERROR_NO_MATCH) { log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError()); - Switchboard.getSwitchboard().indexSegment.termIndex().remove(snippet.getRemainingHashes(), urlHash); + indexSegment.termIndex().remove(snippet.getRemainingHashes(), urlHash); SearchEventCache.getEvent(eventID).remove(urlHash); } return snippet.getError(); diff --git a/source/de/anomic/search/blockrank/CRProcess.java b/source/de/anomic/search/blockrank/CRProcess.java index 0919d16f2..b46f41385 100644 --- a/source/de/anomic/search/blockrank/CRProcess.java +++ b/source/de/anomic/search/blockrank/CRProcess.java @@ -256,6 +256,7 @@ public class CRProcess { newacc = new Table(new File(path, CRG_accname), CRG_accrow, 0, 0, true, false); newseq = new IndexCell( path, + "index", Segment.wordReferenceFactory, Base64Order.enhancedCoder, CRG_colrow, @@ -390,9 +391,9 @@ public class CRProcess { public static int genrcix(final File cr_path_in, final File rci_path_out) throws IOException { //kelondroFlexTable acc = new kelondroFlexTable(cr_path_in, CRG_accname, kelondroBase64Order.enhancedCoder, 128 * 
1024 * 1024, -1, CRG_accrow, true); final IndexCell seq = new IndexCell( - cr_path_in, Segment.wordReferenceFactory, Base64Order.enhancedCoder, CRG_colrow, 10000, 1000000000L, 20, null, 1000000); + cr_path_in, "index", Segment.wordReferenceFactory, Base64Order.enhancedCoder, CRG_colrow, 10000, 1000000000L, 20, null, 1000000); final IndexCell rci = new IndexCell( - rci_path_out, Segment.wordReferenceFactory, Base64Order.enhancedCoder, RCI_coli, 10000, 1000000000L, 20, null, 1000000); + rci_path_out, "index", Segment.wordReferenceFactory, Base64Order.enhancedCoder, RCI_coli, 10000, 1000000000L, 20, null, 1000000); // loop over all referees int count = 0; diff --git a/source/de/anomic/yacy/dht/Dispatcher.java b/source/de/anomic/yacy/dht/Dispatcher.java index 59f58adce..0fa5592bb 100755 --- a/source/de/anomic/yacy/dht/Dispatcher.java +++ b/source/de/anomic/yacy/dht/Dispatcher.java @@ -33,9 +33,7 @@ import java.util.LinkedHashMap; import java.util.Map; import de.anomic.kelondro.order.Base64Order; -import de.anomic.kelondro.text.BufferedIndex; import de.anomic.kelondro.text.ReferenceContainer; -import de.anomic.kelondro.text.MetadataRepository; import de.anomic.kelondro.text.Segment; import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; @@ -83,8 +81,8 @@ public class Dispatcher { // the String-key is the primary target as contained in the Entry private Map transmissionCloud; - // the backend is used to store the remaining indexContainers in case that the object is closed - private BufferedIndex backend; + // the segment backend is used to store the remaining indexContainers in case that the object is closed + private Segment segment; // the seed database private yacySeedDB seeds; @@ -99,21 +97,19 @@ public class Dispatcher { private Transmission transmission; public Dispatcher( - final BufferedIndex backend, - final MetadataRepository repository, + final Segment segment, final yacySeedDB seeds, final boolean gzipBody, final int timeout ) { this.transmissionCloud = new LinkedHashMap(); - this.backend = backend; + this.segment = segment; this.seeds = seeds; this.log = new Log("INDEX-TRANSFER-DISPATCHER"); this.transmission = new Transmission( log, - repository, + segment, seeds, - backend, gzipBody, timeout); //this.selectedContainerCache = null; @@ -171,7 +167,7 @@ public class Dispatcher { final ArrayList> containers = new ArrayList>(maxContainerCount); - final Iterator> indexContainerIterator = this.backend.references(hash, true, ram); + final Iterator> indexContainerIterator = this.segment.termIndex().references(hash, true, ram); ReferenceContainer container; int refcount = 0; @@ -204,7 +200,7 @@ public class Dispatcher { urlHashes.add(it.next().metadataHash()); } if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + c.getTermHashAsString() + "'"); - if (urlHashes.size() > 0) this.backend.remove(c.getTermHash(), urlHashes); + if (urlHashes.size() > 0) this.segment.termIndex().remove(c.getTermHash(), urlHashes); } rc = containers; } else { @@ -212,7 +208,7 @@ public class Dispatcher { // but to avoid race conditions return the results from the deletes rc = new ArrayList>(containers.size()); for (ReferenceContainer c: containers) { - container = this.backend.delete(c.getTermHash()); + container = this.segment.termIndex().delete(c.getTermHash()); if (this.log.isFine()) this.log.logFine("selected " + container.size() + " urls for word '" + c.getTermHashAsString() + "'"); if 
(container.size() != 0) rc.add(container); } @@ -405,7 +401,7 @@ public class Dispatcher { if (indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.announceShutdown(); if (this.transmissionCloud != null) { for (Map.Entry e : this.transmissionCloud.entrySet()) { - for (ReferenceContainer i : e.getValue()) try {this.backend.add(i);} catch (IOException e1) {} + for (ReferenceContainer i : e.getValue()) try {this.segment.termIndex().add(i);} catch (IOException e1) {} } this.transmissionCloud.clear(); } diff --git a/source/de/anomic/yacy/dht/Transmission.java b/source/de/anomic/yacy/dht/Transmission.java index e1b80199d..1d1b29d33 100644 --- a/source/de/anomic/yacy/dht/Transmission.java +++ b/source/de/anomic/yacy/dht/Transmission.java @@ -31,10 +31,8 @@ import java.util.HashSet; import java.util.Iterator; import de.anomic.kelondro.index.Row; -import de.anomic.kelondro.text.Index; import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.ReferenceContainerCache; -import de.anomic.kelondro.text.MetadataRepository; import de.anomic.kelondro.text.Segment; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.text.referencePrototype.WordReference; @@ -47,23 +45,20 @@ import de.anomic.yacy.logging.Log; public class Transmission { protected Log log; - protected MetadataRepository repository; + protected Segment segment; protected yacySeedDB seeds; protected boolean gzipBody4Transfer; protected int timeout4Transfer; - protected Index backend; public Transmission( Log log, - MetadataRepository repository, + Segment segment, yacySeedDB seeds, - Index backend, boolean gzipBody4Transfer, int timeout4Transfer) { this.log = log; - this.repository = repository; + this.segment = segment; this.seeds = seeds; - this.backend = backend; this.gzipBody4Transfer = gzipBody4Transfer; this.timeout4Transfer = timeout4Transfer; } @@ -131,7 +126,7 @@ public class Transmission { notFound.add(e.metadataHash()); continue; } - URLMetadataRow r = repository.load(e.metadataHash(), null, 0); + URLMetadataRow r = segment.urlMetadata().load(e.metadataHash(), null, 0); if (r == null) { notFound.add(e.metadataHash()); badReferences.add(e.metadataHash()); @@ -251,7 +246,7 @@ public class Transmission { } public void restore() { - for (ReferenceContainer ic : this) try { backend.add(ic); } catch (IOException e) {} + for (ReferenceContainer ic : this) try { segment.termIndex().add(ic); } catch (IOException e) {} } } } diff --git a/source/yacy.java b/source/yacy.java index c813ffae1..1a951835e 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -649,10 +649,10 @@ public final class yacy { log.logInfo("STARTING URL CLEANUP"); // db containing all currently loaded urls - final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"), false, false); + final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"), "text.urlmd", false, false); // db used to hold all needed urls - final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"), false, false); + final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"), "text.urlmd", false, false); final int cacheMem = (int)(MemoryControl.maxMemory - MemoryControl.total()); if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); @@ -835,7
+835,7 @@ public final class yacy { final File root = homePath; final File indexroot = new File(root, "DATA/INDEX"); try {Log.configureLogging(homePath, new File(homePath, "DATA/LOG/yacy.logging"));} catch (final Exception e) {} - final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"), false, false); + final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"), "text.urlmd", false, false); currentUrlDB.deadlinkCleaner(null); currentUrlDB.close(); }
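To make the segment wiring in this patch easier to follow, here is a minimal, hypothetical usage sketch for the new Segments class. It only calls constructors and methods that appear above (Segments, setSegment, segment, urlMetadata, URLCount, close, and the Log(String) constructor used by Dispatcher); the path, the cache sizes and the segment name "crawl" are made-up placeholder values, not YaCy defaults.

import java.io.File;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.yacy.logging.Log;

public class SegmentsUsageSketch {
    public static void main(final String[] args) throws Exception {
        // open the segments root; path and sizes are invented example values
        final Segments segments = new Segments(
                new Log("SEGMENTS-SKETCH"),
                new File("DATA/INDEX/example/SEGMENTS"),
                100000,                 // entityCacheMaxSize
                1024L * 1024L * 256L,   // maxFileSize
                false,                  // useTailCache
                false);                 // exceed134217727

        // every process starts on the "default" segment; reassign one of them,
        // as Switchboard does with the segment.process.* configuration values
        segments.setSegment(Segments.Process.LOCALCRAWLING, "crawl");

        // resolving a process lazily opens (or creates) the named segment
        final Segment crawlSegment = segments.segment(Segments.Process.LOCALCRAWLING);
        System.out.println("urls in crawl segment: " + crawlSegment.urlMetadata().size());
        System.out.println("urls over all open segments: " + segments.URLCount());

        segments.close();
    }
}

In the Switchboard itself the same assignment is driven by the segment.process.* keys declared in SwitchboardConstants, with "default" as the fallback segment name.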
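The encodeDate/decodeDate change in URLMetadataRow keeps day granularity: the 4-byte column holds the number of whole days since the epoch, and decoding multiplies by 86400000 milliseconds again. A small, self-contained illustration of that round trip (the time of day is deliberately lost):

import java.util.Date;

public class DayEncodingSketch {
    public static void main(final String[] args) {
        final long millisPerDay = 86400000L;                    // 24 * 60 * 60 * 1000
        final long days = new Date().getTime() / millisPerDay;  // value kept in the 4-byte column
        final Date restored = new Date(days * millisPerDay);    // decode: start of that day (UTC)
        System.out.println(days + " days since epoch -> " + restored);
    }
}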
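The rename of ByteArray.equals to ByteArray.startsWith matches what the method's own comment says it does: it reports whether the pattern appears completely at the start of the buffer, not whether both arrays are equal. A hypothetical call (the file name is an invented example):

import de.anomic.kelondro.util.ByteArray;

public class StartsWithSketch {
    public static void main(final String[] args) {
        final byte[] buffer  = "text.urlmd.20090730.table".getBytes(); // invented file name
        final byte[] pattern = "text.urlmd.".getBytes();
        // true: the pattern matches completely at offset 0, although the arrays differ in length
        System.out.println(ByteArray.startsWith(buffer, pattern));
    }
}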