* added index segments

This is a major change in the organization of indexes.
Please consider making a back-up of your data before you run this update.
All existing index files will be moved to a new location and renamed.
With this change it becomes possible to maintain different indexes for different purposes and to distinguish between DHT-in- and DHT-out-specific indexes. Tenants may also have their own index, and it may become possible to keep histories and back-ups of indexes. This is just the beginning: many servlets still have to be adapted to this change, but all functions that existed before should still work.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6389 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 09de5da74a
commit 735e2737e3
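
For illustration, the servlet-side lookup idiom introduced by this commit (it recurs in the hunks below, e.g. in IndexCleaner_p, IndexImport_p and QuickCrawlLink_p) is sketched here in Java. `sb` is the servlet's Switchboard and `post` its parameter object, as in those hunks; the trailing null fallback is an editorial assumption added so that an unknown segment name still yields the default segment, where the hunks themselves use an else branch.

import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;

// select the index segment to work on: prefer an explicitly
// requested segment, otherwise use the segment assigned to
// the PUBLIC process (the default segment)
Segment indexSegment = null;
if (post != null && post.containsKey("segment")) {
    String segmentName = post.get("segment");
    if (sb.indexSegments.segmentExist(segmentName)) {
        indexSegment = sb.indexSegments.segment(segmentName);
    }
}
if (indexSegment == null) {
    // fallback added in this sketch; the committed servlets use an else branch
    indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
// all further index access then goes through the selected segment, e.g.:
//   indexSegment.urlMetadata().load(urlHash, null, 0);
//   indexSegment.termIndex().sizesMax();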

@ -3,7 +3,7 @@ javacSource=1.5
javacTarget=1.5
# Release Configuration
releaseVersion=0.91
releaseVersion=0.92
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseFileParentDir=yacy

@ -914,7 +914,18 @@ content.phpbb3.tableprefix = phpbb_
content.phpbb3.dbuser = notroot
content.phpbb3.dbpw = joshua
content.phpbb3.ppf = 1000
content.phpbb3.dumpfile =
content.phpbb3.dumpfile =
# segment assignment for index storage processes in YaCy:
# each process can store its index result in its own index segment
segment.process.receipts_tmp = default
segment.process.queries_tmp = default
segment.process.dhtin_tmp = default
segment.process.dhtout_tmp = default
segment.process.proxy_tmp = default
segment.process.localcrawling_tmp = default
segment.process.remotecrawling_tmp = default
segment.process.default_tmp = default
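# as a hypothetical example (the segment name 'dhtin' is not part of this
# commit), incoming DHT transfers could be separated from the default index
# by assigning that process its own segment:
# segment.process.dhtin_tmp = dhtin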
# search engine teaser: an about box in search results
# this is only shown, if the about.body is filled

@ -75,7 +75,7 @@ public class BlacklistCleaner_p {
listManager.listsPath = new File(env.getRootPath(), env.getConfig("listManager.listsPath", "DATA/LISTS"));
String blacklistToUse = null;
// getting the list of supported blacklist types
// get the list of supported blacklist types
final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING;
final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
@ -290,7 +290,7 @@ public class BlacklistCleaner_p {
if (list != null){
// getting rid of escape characters which make it impossible to
// get rid of escape characters which make it impossible to
// properly use contains()
if (s.contains("\\\\")) {
s = s.replaceAll(Pattern.quote("\\\\"), Matcher.quoteReplacement("\\"));

@ -64,11 +64,11 @@ public class Blacklist_p {
listManager.switchboard = (Switchboard) env;
listManager.listsPath = new File(listManager.switchboard.getRootPath(),listManager.switchboard.getConfig("listManager.listsPath", "DATA/LISTS"));
// getting the list of supported blacklist types
// get the list of supported blacklist types
final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING;
final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
// loading all blacklist files located in the directory
// load all blacklist files located in the directory
List<String> dirlist = listManager.getDirListing(listManager.listsPath, BLACKLIST_FILENAME_FILTER);
String blacklistToUse = null;

@ -328,14 +328,14 @@ public class BlogComments {
try {
if (!Boolean.valueOf(sb.getConfig("msgForwardingEnabled","false")).booleanValue()) return;
// getting the recipient address
// get the recipient address
final String sendMailTo = sb.getConfig("msgForwardingTo","root@localhost").trim();
// getting the sendmail configuration
// get the sendmail configuration
final String sendMailStr = sb.getConfig("msgForwardingCmd","/usr/bin/sendmail")+" "+sendMailTo;
final String[] sendMail = sendMailStr.trim().split(" ");
// building the message text
// build the message text
final StringBuilder emailTxt = new StringBuilder();
emailTxt.append("To: ")
.append(sendMailTo)

@ -43,6 +43,7 @@ import de.anomic.data.userDB;
import de.anomic.data.bookmarksDB.Tag;
import de.anomic.document.Document;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.search.Switchboard;
@ -183,7 +184,7 @@ public class Bookmarks {
final bookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
final URLMetadataRow urlentry = sb.indexSegment.urlMetadata().load(urlHash, null, 0);
final URLMetadataRow urlentry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(urlHash, null, 0);
Document document = null;
if (urlentry != null) {
final URLMetadataRow.Components metadata = urlentry.metadata();

@ -88,19 +88,19 @@ public final class Connections_p {
for ( int currentThreadIdx = 0; currentThreadIdx < count; currentThreadIdx++ ) {
final Thread t = threadList[currentThreadIdx];
if ((t != null) && (t instanceof serverCore.Session) && (t.isAlive())) {
// getting the session object
// get the session object
final Session s = ((Session) t);
// getting the session runtime
// get the session runtime
final long sessionTime = s.getTime();
// getting the request command line
// get the request command line
boolean blockingRequest = false;
String commandLine = s.getCommandLine();
if (commandLine == null) blockingRequest = true;
final int commandCount = s.getCommandCount();
// getting the source ip address and port
// get the source ip address and port
final InetAddress userAddress = s.getUserAddress();
final int userPort = s.getUserPort();
if (userAddress == null) continue;
@ -113,13 +113,13 @@ public final class Connections_p {
if (cmdObj instanceof HTTPDemon) {
prot = isSSL ? "https":"http";
// getting the http command object
// get the http command object
final HTTPDemon currentHttpd = (HTTPDemon)cmdObj;
// getting the connection properties of this session
// get the connection properties of this session
final Properties conProp = (Properties) currentHttpd.getConProp().clone();
// getting the destination host
// get the destination host
dest = conProp.getProperty(HeaderFramework.CONNECTION_PROP_HOST);
if (dest==null)continue;
}

@ -31,6 +31,7 @@ import java.util.Iterator;
import java.util.Locale;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
@ -104,7 +105,7 @@ public class CrawlResults {
final String hash = post.get("hash", null);
if (hash != null) {
// delete from database
sb.indexSegment.urlMetadata().remove(hash);
sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(hash);
}
}
@ -114,7 +115,7 @@ public class CrawlResults {
if (hashpart != null) {
// delete all urls for this domain from database
try {
sb.indexSegment.urlMetadata().deleteDomain(hashpart);
sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).deleteDomain(hashpart);
sb.crawlResults.deleteDomain(tabletype, domain, hashpart);
} catch (IOException e) {
e.printStackTrace();
@ -178,7 +179,7 @@ public class CrawlResults {
executorHash = sb.crawlResults.getExecutorHash(tabletype, i);
urlHash = sb.crawlResults.getUrlHash(tabletype, i);
try {
urle = sb.indexSegment.urlMetadata().load(urlHash, null, 0);
urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(urlHash, null, 0);
if(urle == null) {
Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ urlHash);
urlstr = null;

@ -40,6 +40,11 @@
#(/rwidb)#
<p>
URL-DB-Cleaner - Clean up the database by deletion of blacklisted urls:<br />
<select name="selectSegment" size="1">
#{segments}#
<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>
#{/segments}#
</select>
<a href="IndexCleaner_p.html?action=ustart">Start/Resume</a>
<a href="IndexCleaner_p.html?action=ustop">Stop</a>
<a href="IndexCleaner_p.html?action=upause">Pause</a>

@ -27,6 +27,7 @@
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -39,11 +40,23 @@ public class IndexCleaner_p {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;
prop.put("title", "DbCleanup_p");
// get segment
Segment indexSegment = null;
if (post != null && post.containsKey("segment")) {
String segmentName = post.get("segment");
if (sb.indexSegments.segmentExist(segmentName)) {
indexSegment = sb.indexSegments.segment(segmentName);
}
} else {
// take default segment
indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
if (post!=null) {
//prop.putHTML("bla", "post!=null");
if (post.get("action").equals("ustart")) {
if (urldbCleanerThread==null || !urldbCleanerThread.isAlive()) {
urldbCleanerThread = sb.indexSegment.urlMetadata().getBlacklistCleaner(Switchboard.urlBlacklist);
urldbCleanerThread = indexSegment.urlMetadata().getBlacklistCleaner(Switchboard.urlBlacklist);
urldbCleanerThread.start();
}
else {
@ -58,7 +71,7 @@ public class IndexCleaner_p {
}
else if (post.get("action").equals("rstart")) {
if (indexCleanerThread==null || !indexCleanerThread.isAlive()) {
indexCleanerThread = sb.indexSegment.getReferenceCleaner(post.get("wordHash","AAAAAAAAAAAA").getBytes());
indexCleanerThread = indexSegment.getReferenceCleaner(post.get("wordHash","AAAAAAAAAAAA").getBytes());
indexCleanerThread.start();
}
else {
@ -74,10 +87,9 @@ public class IndexCleaner_p {
prop.put("LOCATION","");
return prop;
}
//prop.put("bla", "post==null");
if (urldbCleanerThread!=null) {
prop.put("urldb", "1");
prop.putNum("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/sb.indexSegment.urlMetadata().size())*100);
prop.putNum("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/indexSegment.urlMetadata().size())*100);
prop.putNum("urldb_blacklisted", urldbCleanerThread.blacklistedUrls);
prop.putNum("urldb_total", urldbCleanerThread.totalSearchedUrls);
prop.putHTML("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl);
@ -94,7 +106,7 @@ public class IndexCleaner_p {
prop.put("rwidb_threadAlive", indexCleanerThread.isAlive() + "");
prop.put("rwidb_threadToString", indexCleanerThread.toString());
prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart);
prop.putNum("rwidb_RWIcountnow", sb.indexSegment.termIndex().sizesMax());
prop.putNum("rwidb_RWIcountnow", indexCleanerThread.rwisize());
prop.put("rwidb_wordHashNow", (indexCleanerThread.wordHashNow == null) ? "NULL" : new String(indexCleanerThread.wordHashNow));
prop.put("rwidb_lastWordHash", (indexCleanerThread.lastWordHash == null) ? "null" : new String(indexCleanerThread.lastWordHash));
prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter);

@ -12,6 +12,15 @@
<form action="IndexControlRWIs_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>RWI Retrieval (= search for a single word)</legend>
<dl>
<dt class="TableCellDark">Select Segment:</dt>
<dd>
<select name="selectSegment" size="1">
#{segments}#
<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>
#{/segments}#
</select>
</dd>
<dt class="TableCellDark">Retrieve by Word:</dt>
<dd><input type="text" name="keystring" value="#[keystring]#" size="40" maxlength="80" />
<input type="submit" name="keystringsearch" value="Show URL Entries for Word" />

@ -55,6 +55,7 @@ import de.anomic.search.QueryParams;
import de.anomic.search.RankingProcess;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient;
@ -69,10 +70,20 @@ public class IndexControlRWIs_p {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
// set default values
prop.putHTML("keystring", "");
prop.put("keyhash", "");
prop.put("result", "");
String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
int i = 0;
for (String s: sb.indexSegments.segmentNames()) {
prop.put("segments_" + i + "_name", s);
prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0);
i++;
}
Segment segment = sb.indexSegments.segment(segmentName);
prop.put("segments", i);
// switch off all optional forms/lists
prop.put("searchresult", 0);
prop.put("keyhashsimilar", 0);
@ -83,6 +94,16 @@ public class IndexControlRWIs_p {
if (post != null) {
// default values
segmentName = post.get("segment", segmentName).trim();
i= 0;
for (String s: sb.indexSegments.segmentNames()) {
prop.put("segments_" + i + "_name", s);
prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0);
i++;
}
prop.put("segments", i);
segment = sb.indexSegments.segment(segmentName);
final String keystring = post.get("keystring", "").trim();
byte[] keyhash = post.get("keyhash", "").trim().getBytes();
prop.putHTML("keystring", keystring);
@ -96,7 +117,7 @@ public class IndexControlRWIs_p {
if (post.containsKey("keystringsearch")) {
keyhash = Word.word2hash(keystring);
prop.put("keyhash", keyhash);
final RankingProcess ranking = genSearchresult(prop, sb, keyhash, null);
final RankingProcess ranking = genSearchresult(prop, sb, segment, keyhash, null);
if (ranking.filteredCount() == 0) {
prop.put("searchresult", 1);
prop.putHTML("searchresult_word", keystring);
@ -107,7 +128,7 @@ public class IndexControlRWIs_p {
if (keystring.length() == 0 || !new String(Word.word2hash(keystring)).equals(new String(keyhash))) {
prop.put("keystring", "&lt;not possible to compute word from hash&gt;");
}
final RankingProcess ranking = genSearchresult(prop, sb, keyhash, null);
final RankingProcess ranking = genSearchresult(prop, sb, segment, keyhash, null);
if (ranking.filteredCount() == 0) {
prop.put("searchresult", 2);
prop.putHTML("searchresult_wordhash", new String(keyhash));
@ -116,7 +137,7 @@ public class IndexControlRWIs_p {
// delete everything
if (post.containsKey("deletecomplete") && post.containsKey("confirmDelete")) {
sb.indexSegment.clear();
segment.clear();
sb.crawlQueues.clear();
sb.crawlStacker.clear();
try {
@ -132,9 +153,9 @@ public class IndexControlRWIs_p {
if (delurl || delurlref) {
// generate an urlx array
ReferenceContainer<WordReference> index = null;
index = sb.indexSegment.termIndex().get(keyhash, null);
index = segment.termIndex().get(keyhash, null);
final Iterator<WordReference> en = index.entries();
int i = 0;
i = 0;
urlx = new String[index.size()];
while (en.hasNext()) {
urlx[i++] = en.next().metadataHash();
@ -142,14 +163,14 @@ public class IndexControlRWIs_p {
index = null;
}
if (delurlref) {
for (int i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(urlx[i], true);
for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true);
}
if (delurl || delurlref) {
for (int i = 0; i < urlx.length; i++) {
sb.urlRemove(urlx[i]);
for (i = 0; i < urlx.length; i++) {
sb.urlRemove(segment, urlx[i]);
}
}
sb.indexSegment.termIndex().delete(keyhash);
segment.termIndex().delete(keyhash);
post.remove("keyhashdeleteall");
post.put("urllist", "generated");
} catch (IOException e) {
@ -159,16 +180,16 @@ public class IndexControlRWIs_p {
// delete selected URLs
if (post.containsKey("keyhashdelete")) try {
if (delurlref) {
for (int i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(urlx[i], true);
for (i = 0; i < urlx.length; i++) sb.removeAllUrlReferences(segment, urlx[i], true);
}
if (delurl || delurlref) {
for (int i = 0; i < urlx.length; i++) {
sb.urlRemove(urlx[i]);
for (i = 0; i < urlx.length; i++) {
sb.urlRemove(segment, urlx[i]);
}
}
final Set<String> urlHashes = new HashSet<String>();
for (int i = 0; i < urlx.length; i++) urlHashes.add(urlx[i]);
sb.indexSegment.termIndex().remove(keyhash, urlHashes);
for (i = 0; i < urlx.length; i++) urlHashes.add(urlx[i]);
segment.termIndex().remove(keyhash, urlHashes);
// this shall lead to a presentation of the list; so handle that the remaining program
// thinks that it was called for a list presentation
post.remove("keyhashdelete");
@ -183,7 +204,7 @@ public class IndexControlRWIs_p {
}
final Bitfield flags = compileFlags(post);
final int count = (post.get("lines", "all").equals("all")) ? -1 : post.getInt("lines", -1);
final RankingProcess ranking = genSearchresult(prop, sb, keyhash, flags);
final RankingProcess ranking = genSearchresult(prop, sb, segment, keyhash, flags);
genURLList(prop, keyhash, keystring, ranking, flags, count);
}
@ -212,7 +233,7 @@ public class IndexControlRWIs_p {
// prepare index
ReferenceContainer<WordReference> index;
final long starttime = System.currentTimeMillis();
index = sb.indexSegment.termIndex().get(keyhash, null);
index = segment.termIndex().get(keyhash, null);
// built urlCache
final Iterator<WordReference> urlIter = index.entries();
final HashMap<String, URLMetadataRow> knownURLs = new HashMap<String, URLMetadataRow>();
@ -221,7 +242,7 @@ public class IndexControlRWIs_p {
URLMetadataRow lurl;
while (urlIter.hasNext()) {
iEntry = urlIter.next();
lurl = sb.indexSegment.urlMetadata().load(iEntry.metadataHash(), null, 0);
lurl = segment.urlMetadata().load(iEntry.metadataHash(), null, 0);
if (lurl == null) {
unknownURLEntries.add(iEntry.metadataHash());
urlIter.remove();
@ -251,9 +272,9 @@ public class IndexControlRWIs_p {
// generate list
if (post.containsKey("keyhashsimilar")) try {
final Iterator<ReferenceContainer<WordReference>> containerIt = sb.indexSegment.termIndex().references(keyhash, true, 256, false).iterator();
final Iterator<ReferenceContainer<WordReference>> containerIt = segment.termIndex().references(keyhash, true, 256, false).iterator();
ReferenceContainer<WordReference> container;
int i = 0;
i = 0;
int rows = 0, cols = 0;
prop.put("keyhashsimilar", "1");
while (containerIt.hasNext() && i < 256) {
@ -283,10 +304,10 @@ public class IndexControlRWIs_p {
final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(",");
pw = new PrintWriter(new FileWriter(new File(listManager.listsPath, blacklist), true));
yacyURL url;
for (int i=0; i<urlx.length; i++) {
for (i = 0; i < urlx.length; i++) {
urlHashes.add(urlx[i]);
final URLMetadataRow e = sb.indexSegment.urlMetadata().load(urlx[i], null, 0);
sb.indexSegment.urlMetadata().remove(urlx[i]);
final URLMetadataRow e = segment.urlMetadata().load(urlx[i], null, 0);
segment.urlMetadata().remove(urlx[i]);
if (e != null) {
url = e.metadata().url();
pw.println(url.getHost() + "/" + url.getFile());
@ -311,10 +332,10 @@ public class IndexControlRWIs_p {
final String[] supportedBlacklistTypes = AbstractBlacklist.BLACKLIST_TYPES_STRING.split(",");
pw = new PrintWriter(new FileWriter(new File(listManager.listsPath, blacklist), true));
yacyURL url;
for (int i=0; i<urlx.length; i++) {
for (i = 0; i<urlx.length; i++) {
urlHashes.add(urlx[i]);
final URLMetadataRow e = sb.indexSegment.urlMetadata().load(urlx[i], null, 0);
sb.indexSegment.urlMetadata().remove(urlx[i]);
final URLMetadataRow e = segment.urlMetadata().load(urlx[i], null, 0);
segment.urlMetadata().remove(urlx[i]);
if (e != null) {
url = e.metadata().url();
pw.println(url.getHost() + "/.*");
@ -332,7 +353,7 @@ public class IndexControlRWIs_p {
}
}
try {
sb.indexSegment.termIndex().remove(keyhash, urlHashes);
segment.termIndex().remove(keyhash, urlHashes);
} catch (IOException e) {
e.printStackTrace();
}
@ -343,7 +364,7 @@ public class IndexControlRWIs_p {
// insert constants
prop.putNum("wcount", sb.indexSegment.termIndex().sizesMax());
prop.putNum("wcount", segment.termIndex().sizesMax());
// return rewrite properties
return prop;
}
@ -477,9 +498,9 @@ public class IndexControlRWIs_p {
prop.put("searchresult_hosts", hc);
}
public static RankingProcess genSearchresult(final serverObjects prop, final Switchboard sb, final byte[] keyhash, final Bitfield filter) {
public static RankingProcess genSearchresult(final serverObjects prop, final Switchboard sb, Segment segment, final byte[] keyhash, final Bitfield filter) {
final QueryParams query = new QueryParams(new String(keyhash), -1, sb.getRanking(), filter);
final RankingProcess ranked = new RankingProcess(sb.indexSegment, query, Integer.MAX_VALUE, 1);
final RankingProcess ranked = new RankingProcess(segment, query, Integer.MAX_VALUE, 1);
ranked.run();
if (ranked.filteredCount() == 0) {

@ -13,6 +13,15 @@
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>URL Retrieval</legend>
<dl>
<dt class="TableCellDark">Select Segment:</dt>
<dd>
<select name="selectSegment" size="1">
#{segments}#
<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>
#{/segments}#
</select>
</dd>
<dt class="TableCellDark">Retrieve by URL:</dt>
<dd><input type="text" name="urlstring" value="#[urlstring]#" size="40" maxlength="250" />
<input type="submit" name="urlstringsearch" value="Show Details for URL" />

@ -34,9 +34,11 @@ import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacySeedDB;
@ -49,10 +51,21 @@ public class IndexControlURLs_p {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
// set default values
prop.put("urlstring", "");
prop.put("urlhash", "");
prop.put("result", "");
prop.put("ucount", Integer.toString(sb.indexSegment.urlMetadata().size()));
String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
int i = 0;
for (String s: sb.indexSegments.segmentNames()) {
prop.put("segments_" + i + "_name", s);
prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0);
i++;
}
Segment segment = sb.indexSegments.segment(segmentName);
prop.put("segments", i);
prop.put("ucount", Integer.toString(segment.urlMetadata().size()));
prop.put("otherHosts", "");
prop.put("genUrlProfile", 0);
prop.put("statistics", 1);
@ -60,8 +73,22 @@ public class IndexControlURLs_p {
prop.put("statisticslines", 0);
prop.put("reload", 0);
// do segment selection
if (post != null && post.containsKey("segment")) {
// default values
segmentName = post.get("segment", segmentName).trim();
i= 0;
for (String s: sb.indexSegments.segmentNames()) {
prop.put("segments_" + i + "_name", s);
prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0);
i++;
}
prop.put("segments", i);
segment = sb.indexSegments.segment(segmentName);
}
// show export messages
final MetadataRepository.Export export = sb.indexSegment.urlMetadata().export();
final MetadataRepository.Export export = segment.urlMetadata().export();
if ((export != null) && (export.isAlive())) {
// there is currently a running export
prop.put("lurlexport", 2);
@ -108,20 +135,20 @@ public class IndexControlURLs_p {
prop.put("result", " ");
if (post.containsKey("urlhashdeleteall")) {
final int i = sb.removeAllUrlReferences(urlhash, true);
i = sb.removeAllUrlReferences(segment, urlhash, true);
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
prop.put("lurlexport", 0);
prop.put("reload", 0);
}
if (post.containsKey("urlhashdelete")) {
final URLMetadataRow entry = sb.indexSegment.urlMetadata().load(urlhash, null, 0);
final URLMetadataRow entry = segment.urlMetadata().load(urlhash, null, 0);
if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
urlstring = entry.metadata().url().toNormalform(false, true);
prop.put("urlstring", "");
sb.urlRemove(urlhash);
sb.urlRemove(segment, urlhash);
prop.putHTML("result", "Removed URL " + urlstring);
}
prop.put("lurlexport", 0);
@ -137,7 +164,7 @@ public class IndexControlURLs_p {
if ((urlhash == null) || (urlstring == null)) {
prop.put("result", "No input given; nothing deleted.");
} else {
sb.urlRemove(urlhash);
sb.urlRemove(segment, urlhash);
prop.putHTML("result", "Removed URL " + urlstring);
}
prop.put("lurlexport", 0);
@ -149,12 +176,12 @@ public class IndexControlURLs_p {
final yacyURL url = new yacyURL(urlstring, null);
urlhash = url.hash();
prop.put("urlhash", urlhash);
final URLMetadataRow entry = sb.indexSegment.urlMetadata().load(urlhash, null, 0);
final URLMetadataRow entry = segment.urlMetadata().load(urlhash, null, 0);
if (entry == null) {
prop.putHTML("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
} else {
prop.putAll(genUrlProfile(sb, entry, urlhash));
prop.putAll(genUrlProfile(segment, entry, urlhash));
prop.put("statistics", 0);
}
} catch (final MalformedURLException e) {
@ -166,12 +193,12 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashsearch")) {
final URLMetadataRow entry = sb.indexSegment.urlMetadata().load(urlhash, null, 0);
final URLMetadataRow entry = segment.urlMetadata().load(urlhash, null, 0);
if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash);
} else {
prop.putHTML("urlstring", entry.metadata().url().toNormalform(false, true));
prop.putAll(genUrlProfile(sb, entry, urlhash));
prop.putAll(genUrlProfile(segment, entry, urlhash));
prop.put("statistics", 0);
}
prop.put("lurlexport", 0);
@ -181,10 +208,10 @@ public class IndexControlURLs_p {
// generate list
if (post.containsKey("urlhashsimilar")) {
try {
final Iterator<URLMetadataRow> entryIt = new RotateIterator<URLMetadataRow>(sb.indexSegment.urlMetadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.indexSegment.termIndex().sizesMax());
final Iterator<URLMetadataRow> entryIt = new RotateIterator<URLMetadataRow>(segment.urlMetadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
URLMetadataRow entry;
int i = 0;
i = 0;
int rows = 0, cols = 0;
prop.put("urlhashsimilar", "1");
while (entryIt.hasNext() && i < 256) {
@ -228,7 +255,7 @@ public class IndexControlURLs_p {
final File f = new File(s);
f.getParentFile().mkdirs();
final String filter = post.get("exportfilter", ".*");
final MetadataRepository.Export running = sb.indexSegment.urlMetadata().export(f, filter, null, format, dom);
final MetadataRepository.Export running = segment.urlMetadata().export(f, filter, null, format, dom);
prop.put("lurlexport_exportfile", s);
prop.put("lurlexport_urlcount", running.count());
@ -241,7 +268,7 @@ public class IndexControlURLs_p {
if (post.containsKey("deletedomain")) {
String hp = post.get("hashpart");
try {
sb.indexSegment.urlMetadata().deleteDomain(hp);
segment.urlMetadata().deleteDomain(hp);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
@ -257,7 +284,7 @@ public class IndexControlURLs_p {
prop.put("statistics_lines", count);
int cnt = 0;
try {
statsiter = sb.indexSegment.urlMetadata().statistics(count);
statsiter = segment.urlMetadata().statistics(count);
boolean dark = true;
MetadataRepository.hostStat hs;
while (statsiter.hasNext() && cnt < count) {
@ -280,12 +307,12 @@ public class IndexControlURLs_p {
}
// insert constants
prop.putNum("ucount", sb.indexSegment.urlMetadata().size());
prop.putNum("ucount", segment.urlMetadata().size());
// return rewrite properties
return prop;
}
private static serverObjects genUrlProfile(final Switchboard switchboard, final URLMetadataRow entry, final String urlhash) {
private static serverObjects genUrlProfile(final Segment segment, final URLMetadataRow entry, final String urlhash) {
final serverObjects prop = new serverObjects();
if (entry == null) {
prop.put("genUrlProfile", "1");
@ -293,7 +320,7 @@ public class IndexControlURLs_p {
return prop;
}
final URLMetadataRow.Components metadata = entry.metadata();
final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : switchboard.indexSegment.urlMetadata().load(entry.referrerHash(), null, 0);
final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : segment.urlMetadata().load(entry.referrerHash(), null, 0);
if (metadata.url() == null) {
prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash);

@ -35,6 +35,8 @@ import java.util.Date;
import de.anomic.crawler.Importer;
import de.anomic.crawler.NoticeURLImporter;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.util.ByteBuffer;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.search.Switchboard;
@ -45,10 +47,22 @@ public final class IndexImport_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard switchboard = (Switchboard) env;
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
int activeCount = 0;
// get segment
Segment indexSegment = null;
if (post != null && post.containsKey("segment")) {
String segmentName = post.get("segment");
if (sb.indexSegments.segmentExist(segmentName)) {
indexSegment = sb.indexSegments.segment(segmentName);
}
} else {
// take default segment
indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
if (post != null) {
if (post.containsKey("startIndexDbImport")) {
@ -56,13 +70,13 @@ public final class IndexImport_p {
final boolean startImport = true;
if (startImport) {
final Importer importerThread = new NoticeURLImporter(
switchboard.queuesRoot,
switchboard.crawlQueues,
switchboard.crawler.profilesActiveCrawls,
switchboard.dbImportManager);
sb.queuesRoot,
sb.crawlQueues,
sb.crawler.profilesActiveCrawls,
sb.dbImportManager);
if (importerThread != null) {
importerThread.setJobID(switchboard.dbImportManager.generateUniqueJobID());
importerThread.setJobID(sb.dbImportManager.generateUniqueJobID());
importerThread.startIt();
}
prop.put("LOCATION","");
@ -80,7 +94,7 @@ public final class IndexImport_p {
errorOut.close();
}
} else if (post.containsKey("clearFinishedJobList")) {
switchboard.dbImportManager.finishedJobs.clear();
sb.dbImportManager.finishedJobs.clear();
prop.put("LOCATION", "");
return prop;
} else if (
@ -88,9 +102,9 @@ public final class IndexImport_p {
(post.containsKey("pauseIndexDbImport")) ||
(post.containsKey("continueIndexDbImport"))
) {
// getting the job nr of the thread
// get the job nr of the thread
final String jobID = post.get("jobNr");
final Importer importer = switchboard.dbImportManager.getImporterByID(Integer.valueOf(jobID).intValue());
final Importer importer = sb.dbImportManager.getImporterByID(Integer.valueOf(jobID).intValue());
if (importer != null) {
if (post.containsKey("stopIndexDbImport")) {
try {
@ -110,13 +124,13 @@ public final class IndexImport_p {
}
}
prop.putNum("wcount", switchboard.indexSegment.termIndex().sizesMax());
prop.putNum("ucount", switchboard.indexSegment.urlMetadata().size());
prop.putNum("wcount", indexSegment.termIndex().sizesMax());
prop.putNum("ucount", indexSegment.urlMetadata().size());
/*
* Loop over all currently running jobs
*/
final Importer[] importThreads = switchboard.dbImportManager.getRunningImporter();
final Importer[] importThreads = sb.dbImportManager.getRunningImporter();
activeCount = importThreads.length;
for (int i=0; i < activeCount; i++) {
@ -154,7 +168,7 @@ public final class IndexImport_p {
/*
* Loop over all finished jobs
*/
final Importer[] finishedJobs = switchboard.dbImportManager.getFinishedImporter();
final Importer[] finishedJobs = sb.dbImportManager.getFinishedImporter();
for (int i=0; i<finishedJobs.length; i++) {
final Importer currThread = finishedJobs[i];
final String error = currThread.getError();

@ -29,6 +29,8 @@
//import java.io.*;
//import de.anomic.tools.*;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
@ -38,38 +40,41 @@ public class IndexShare_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard switchboard = (Switchboard) env;
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
if(switchboard == null) {
prop.put("linkfreq", "30");
prop.put("wordfreq", "10");
prop.put("dtable", "");
prop.put("rtable", "");
prop.putNum("wcount", 0);
prop.putNum("ucount", 0);
return prop; // be save
// get segment
Segment indexSegment = null;
if (post != null && post.containsKey("segment")) {
String segmentName = post.get("segment");
if (sb.indexSegments.segmentExist(segmentName)) {
indexSegment = sb.indexSegments.segment(segmentName);
}
} else {
// take default segment
indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
if (post == null) {
prop.put("linkfreq", switchboard.getConfigLong("defaultLinkReceiveFrequency",30));
prop.put("wordfreq", switchboard.getConfigLong("defaultWordReceiveFrequency",10));
prop.put("linkfreq", sb.getConfigLong("defaultLinkReceiveFrequency",30));
prop.put("wordfreq", sb.getConfigLong("defaultWordReceiveFrequency",10));
prop.put("dtable", "");
prop.put("rtable", "");
prop.putNum("wcount", switchboard.indexSegment.termIndex().sizesMax());
prop.putNum("ucount", switchboard.indexSegment.urlMetadata().size());
prop.putNum("wcount", indexSegment.termIndex().sizesMax());
prop.putNum("ucount", indexSegment.urlMetadata().size());
return prop; // be save
}
if (post.containsKey("indexsharesetting")) {
switchboard.setConfig(SwitchboardConstants.INDEX_DIST_ALLOW, (post.containsKey("distribute")) ? "true" : "false");
switchboard.setConfig("allowReceiveIndex", (post.containsKey("receive")) ? "true" : "false");
switchboard.setConfig("defaultLinkReceiveFrequency", post.get("linkfreq", "30"));
switchboard.setConfig("defaultWordReceiveFrequency", post.get("wordfreq", "10"));
sb.setConfig(SwitchboardConstants.INDEX_DIST_ALLOW, (post.containsKey("distribute")) ? "true" : "false");
sb.setConfig("allowReceiveIndex", (post.containsKey("receive")) ? "true" : "false");
sb.setConfig("defaultLinkReceiveFrequency", post.get("linkfreq", "30"));
sb.setConfig("defaultWordReceiveFrequency", post.get("wordfreq", "10"));
}
// insert constants
prop.putNum("wcount", switchboard.indexSegment.termIndex().sizesMax());
prop.putNum("ucount", switchboard.indexSegment.urlMetadata().size());
prop.putNum("wcount", indexSegment.termIndex().sizesMax());
prop.putNum("ucount", indexSegment.urlMetadata().size());
// return rewrite properties
return prop;

@ -41,7 +41,7 @@ public class PerformanceGraph {
final int width = post.getInt("width", 660);
final int height = post.getInt("height", 240);
return ProfilingGraph.performanceGraph(width, height, sb.indexSegment.urlMetadata().size() + " URLS / " + sb.indexSegment.termIndex().sizesMax() + " WORDS IN INDEX / " + sb.indexSegment.termIndex().getBufferSize() + " WORDS IN CACHE");
return ProfilingGraph.performanceGraph(width, height, sb.indexSegments.URLCount() + " URLS / " + sb.indexSegments.RWICount() + " WORDS IN INDEX / " + sb.indexSegments.RWIBufferCount() + " WORDS IN CACHE");
}
}

@ -30,6 +30,8 @@ import java.util.Iterator;
import java.util.Map;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
@ -52,15 +54,28 @@ public class PerformanceQueues_p {
performanceProfiles.put("defaults/performance_dht.profile", "prefer DHT");
}
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch sb) {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard switchboard = (Switchboard) sb;
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
File defaultSettingsFile = new File(switchboard.getRootPath(), "defaults/yacy.init");
File defaultSettingsFile = new File(sb.getRootPath(), "defaults/yacy.init");
// get segment
Segment indexSegment = null;
if (post != null && post.containsKey("segment")) {
String segmentName = post.get("segment");
if (sb.indexSegments.segmentExist(segmentName)) {
indexSegment = sb.indexSegments.segment(segmentName);
}
} else {
// take default segment
indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
if(post != null) {
if(post.containsKey("defaultFile")){
// TODO check file-path!
final File value = new File(switchboard.getRootPath(), post.get("defaultFile", "defaults/yacy.init"));
final File value = new File(sb.getRootPath(), post.get("defaultFile", "defaults/yacy.init"));
// check if value is readable file
if(value.exists() && value.isFile() && value.canRead()) {
defaultSettingsFile = value;
@ -77,7 +92,7 @@ public class PerformanceQueues_p {
}
}
final Map<String, String> defaultSettings = ((post == null) || (!(post.containsKey("submitdefault")))) ? null : FileUtils.loadMap(defaultSettingsFile);
Iterator<String> threads = switchboard.threadNames();
Iterator<String> threads = sb.threadNames();
String threadName;
serverBusyThread thread;
@ -88,7 +103,7 @@ public class PerformanceQueues_p {
long blocktime_total = 0, sleeptime_total = 0, exectime_total = 0;
while (threads.hasNext()) {
threadName = threads.next();
thread = switchboard.getThread(threadName);
thread = sb.getThread(threadName);
blocktime_total += thread.getBlockTime();
sleeptime_total += thread.getSleepTime();
exectime_total += thread.getExecTime();
@ -101,7 +116,7 @@ public class PerformanceQueues_p {
long blocktime, sleeptime, exectime;
long idlesleep, busysleep, memuse, memprereq;
int queuesize;
threads = switchboard.threadNames();
threads = sb.threadNames();
int c = 0;
long idleCycles, busyCycles, memshortageCycles;
// set profile?
@ -110,13 +125,13 @@ public class PerformanceQueues_p {
final boolean setDelay = (post != null) && (post.containsKey("submitdelay"));
// save used settings file to config
if (setProfile && post != null){
switchboard.setConfig("performanceProfile", post.get("defaultFile", "defaults/yacy.init"));
switchboard.setConfig("performanceSpeed", post.getInt("profileSpeed", 100));
sb.setConfig("performanceProfile", post.get("defaultFile", "defaults/yacy.init"));
sb.setConfig("performanceSpeed", post.getInt("profileSpeed", 100));
}
while (threads.hasNext()) {
threadName = threads.next();
thread = switchboard.getThread(threadName);
thread = sb.getThread(threadName);
// set values to templates
prop.put("table_" + c + "_threadname", threadName);
@ -154,21 +169,21 @@ public class PerformanceQueues_p {
prop.putNum("table_" + c + "_memusepercycle", (busyCycles == 0) ? -1 : memuse / busyCycles / 1024);
// load with old values
idlesleep = switchboard.getConfigLong(threadName + "_idlesleep" , 1000);
busysleep = switchboard.getConfigLong(threadName + "_busysleep", 100);
memprereq = switchboard.getConfigLong(threadName + "_memprereq", 0);
idlesleep = sb.getConfigLong(threadName + "_idlesleep" , 1000);
busysleep = sb.getConfigLong(threadName + "_busysleep", 100);
memprereq = sb.getConfigLong(threadName + "_memprereq", 0);
if (setDelay && post != null) {
// load with new values
idlesleep = post.getLong(threadName + "_idlesleep", idlesleep);
busysleep = post.getLong(threadName + "_busysleep", busysleep);
memprereq = post.getLong(threadName + "_memprereq", memprereq) * 1024;
if (memprereq == 0) memprereq = switchboard.getConfigLong(threadName + "_memprereq", 0);
if (memprereq == 0) memprereq = sb.getConfigLong(threadName + "_memprereq", 0);
// check values to prevent short-cut loops
if (idlesleep < 1000) idlesleep = 1000;
if (threadName.equals("10_httpd")) { idlesleep = 0; busysleep = 0; memprereq = 0; }
onTheFlyReconfiguration(switchboard, threadName, idlesleep, busysleep, memprereq);
onTheFlyReconfiguration(sb, threadName, idlesleep, busysleep, memprereq);
} if (setProfile) {
if (threadName.equals(SwitchboardConstants.PEER_PING)
|| threadName.equals(SwitchboardConstants.SEED_UPLOAD)
@ -177,7 +192,7 @@ public class PerformanceQueues_p {
) { /* do not change any values */ }
else if (threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER)
|| threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) {
switchboard.setRemotecrawlPPM(Math.max(1, (int) (switchboard.getConfigLong("network.unit.remotecrawl.speed", 60) / multiplier)));
sb.setRemotecrawlPPM(Math.max(1, (int) (sb.getConfigLong("network.unit.remotecrawl.speed", 60) / multiplier)));
}
else {
// load with new values
@ -190,7 +205,7 @@ public class PerformanceQueues_p {
if (threadName.equals("10_httpd")) { idlesleep = 0; busysleep = 0; memprereq = 0; }
//if (threadName.equals(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) && (busysleep < 50)) busysleep = 50;
onTheFlyReconfiguration(switchboard, threadName, idlesleep, busysleep, memprereq);
onTheFlyReconfiguration(sb, threadName, idlesleep, busysleep, memprereq);
}
}
prop.put("table_" + c + "_idlesleep", idlesleep);
@ -199,14 +214,14 @@ public class PerformanceQueues_p {
// disallow setting of memprereq for indexer to prevent db from throwing OOMs
prop.put("table_" + c + "_disabled", /*(threadName.endsWith("_indexing")) ? 1 :*/ "0");
prop.put("table_" + c + "_recommendation", threadName.endsWith("_indexing") ? "1" : "0");
prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (switchboard.indexSegment.termIndex().minMem() / 1024) : 0);
prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (indexSegment.termIndex().minMem() / 1024) : 0);
c++;
}
prop.put("table", c);
// performance profiles
c = 0;
final String usedfile = switchboard.getConfig("performanceProfile", "defaults/yacy.init");
final String usedfile = sb.getConfig("performanceProfile", "defaults/yacy.init");
for(final String filename: performanceProfiles.keySet()) {
prop.put("profile_" + c + "_filename", filename);
prop.put("profile_" + c + "_description", performanceProfiles.get(filename));
@ -217,7 +232,7 @@ public class PerformanceQueues_p {
c = 0;
final int[] speedValues = {200,150,100,50,25,10};
final int usedspeed = Integer.parseInt(switchboard.getConfig("performanceSpeed", "100"));
final int usedspeed = Integer.parseInt(sb.getConfig("performanceSpeed", "100"));
for(final int speed: speedValues){
prop.put("speed_" + c + "_value", speed);
prop.put("speed_" + c + "_label", speed + " %");
@ -228,8 +243,8 @@ public class PerformanceQueues_p {
if ((post != null) && (post.containsKey("cacheSizeSubmit"))) {
final int wordCacheMaxCount = post.getInt("wordCacheMaxCount", 20000);
switchboard.setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));
switchboard.indexSegment.termIndex().setBufferMaxWordCount(wordCacheMaxCount);
sb.setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));
indexSegment.termIndex().setBufferMaxWordCount(wordCacheMaxCount);
}
if ((post != null) && (post.containsKey("poolConfig"))) {
@ -237,17 +252,17 @@ public class PerformanceQueues_p {
/*
* configuring the crawler pool
*/
// getting the current crawler pool configuration
// get the current crawler pool configuration
int maxBusy = Integer.parseInt(post.get("Crawler Pool_maxActive","8"));
// storing the new values into configfile
switchboard.setConfig(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX,maxBusy);
sb.setConfig(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX,maxBusy);
//switchboard.setConfig("crawler.MinIdleThreads",minIdle);
/*
* configuring the http pool
*/
final serverThread httpd = switchboard.getThread("10_httpd");
final serverThread httpd = sb.getThread("10_httpd");
try {
maxBusy = Integer.parseInt(post.get("httpd Session Pool_maxActive","8"));
} catch (final NumberFormatException e) {
@ -257,61 +272,61 @@ public class PerformanceQueues_p {
((serverCore)httpd).setMaxSessionCount(maxBusy);
// storing the new values into configfile
switchboard.setConfig("httpdMaxBusySessions",maxBusy);
sb.setConfig("httpdMaxBusySessions",maxBusy);
}
if ((post != null) && (post.containsKey("PrioritySubmit"))) {
switchboard.setConfig("javastart_priority",post.get("YaCyPriority","0"));
sb.setConfig("javastart_priority",post.get("YaCyPriority","0"));
}
if ((post != null) && (post.containsKey("onlineCautionSubmit"))) {
switchboard.setConfig(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseProxy", 30000)));
switchboard.setConfig(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseLocalsearch", 30000)));
switchboard.setConfig(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseRemotesearch", 30000)));
sb.setConfig(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseProxy", 30000)));
sb.setConfig(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseLocalsearch", 30000)));
sb.setConfig(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseRemotesearch", 30000)));
}
if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) {
final long minimumLocalDelta = post.getLong("minimumLocalDelta", switchboard.crawlQueues.noticeURL.getMinimumLocalDelta());
final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", switchboard.crawlQueues.noticeURL.getMinimumGlobalDelta());
switchboard.setConfig("minimumLocalDelta", minimumLocalDelta);
switchboard.setConfig("minimumGlobalDelta", minimumGlobalDelta);
switchboard.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
final long minimumLocalDelta = post.getLong("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
sb.setConfig("minimumLocalDelta", minimumLocalDelta);
sb.setConfig("minimumGlobalDelta", minimumGlobalDelta);
sb.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
}
// delta settings
prop.put("minimumLocalDelta", switchboard.crawlQueues.noticeURL.getMinimumLocalDelta());
prop.put("minimumGlobalDelta", switchboard.crawlQueues.noticeURL.getMinimumGlobalDelta());
prop.put("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
// table cache settings
prop.putNum("urlCacheSize", switchboard.indexSegment.urlMetadata().writeCacheSize());
prop.putNum("wordCacheSize", switchboard.indexSegment.termIndex().getBufferSize());
prop.putNum("wordCacheSizeKBytes", switchboard.indexSegment.termIndex().getBufferSizeBytes()/1024);
prop.putNum("maxURLinCache", switchboard.indexSegment.termIndex().getBufferMaxReferences());
prop.putNum("maxAgeOfCache", switchboard.indexSegment.termIndex().getBufferMaxAge() / 1000 / 60); // minutes
prop.putNum("minAgeOfCache", switchboard.indexSegment.termIndex().getBufferMinAge() / 1000 / 60); // minutes
prop.putNum("maxWaitingWordFlush", switchboard.getConfigLong("maxWaitingWordFlush", 180));
prop.put("wordCacheMaxCount", switchboard.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000));
prop.put("crawlPauseProxy", switchboard.getConfigLong(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, 30000));
prop.put("crawlPauseLocalsearch", switchboard.getConfigLong(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, 30000));
prop.put("crawlPauseRemotesearch", switchboard.getConfigLong(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, 30000));
prop.putNum("crawlPauseProxyCurrent", (System.currentTimeMillis() - switchboard.proxyLastAccess) / 1000);
prop.putNum("crawlPauseLocalsearchCurrent", (System.currentTimeMillis() - switchboard.localSearchLastAccess) / 1000);
prop.putNum("crawlPauseRemotesearchCurrent", (System.currentTimeMillis() - switchboard.remoteSearchLastAccess) / 1000);
prop.putNum("urlCacheSize", indexSegment.urlMetadata().writeCacheSize());
prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize());
prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024);
prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences());
prop.putNum("maxAgeOfCache", indexSegment.termIndex().getBufferMaxAge() / 1000 / 60); // minutes
prop.putNum("minAgeOfCache", indexSegment.termIndex().getBufferMinAge() / 1000 / 60); // minutes
prop.putNum("maxWaitingWordFlush", sb.getConfigLong("maxWaitingWordFlush", 180));
prop.put("wordCacheMaxCount", sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000));
prop.put("crawlPauseProxy", sb.getConfigLong(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, 30000));
prop.put("crawlPauseLocalsearch", sb.getConfigLong(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, 30000));
prop.put("crawlPauseRemotesearch", sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, 30000));
prop.putNum("crawlPauseProxyCurrent", (System.currentTimeMillis() - sb.proxyLastAccess) / 1000);
prop.putNum("crawlPauseLocalsearchCurrent", (System.currentTimeMillis() - sb.localSearchLastAccess) / 1000);
prop.putNum("crawlPauseRemotesearchCurrent", (System.currentTimeMillis() - sb.remoteSearchLastAccess) / 1000);
// table thread pool settings
prop.put("pool_0_name","Crawler Pool");
prop.put("pool_0_maxActive", switchboard.getConfigLong("crawler.MaxActiveThreads", 0));
prop.put("pool_0_numActive",switchboard.crawlQueues.size());
prop.put("pool_0_maxActive", sb.getConfigLong("crawler.MaxActiveThreads", 0));
prop.put("pool_0_numActive",sb.crawlQueues.size());
final serverThread httpd = switchboard.getThread("10_httpd");
final serverThread httpd = sb.getThread("10_httpd");
prop.put("pool_1_name", "httpd Session Pool");
prop.put("pool_1_maxActive", ((serverCore)httpd).getMaxSessionCount());
prop.put("pool_1_numActive", ((serverCore)httpd).getJobCount());
prop.put("pool", "2");
final long curr_prio = switchboard.getConfigLong("javastart_priority",0);
final long curr_prio = sb.getConfigLong("javastart_priority",0);
prop.put("priority_normal",(curr_prio==0) ? "1" : "0");
prop.put("priority_below",(curr_prio==10) ? "1" : "0");
prop.put("priority_low",(curr_prio==20) ? "1" : "0");

@ -38,6 +38,8 @@ import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -58,11 +60,23 @@ public class QuickCrawlLink_p {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;
// get segment
Segment indexSegment = null;
if (post != null && post.containsKey("segment")) {
String segmentName = post.get("segment");
if (sb.indexSegments.segmentExist(segmentName)) {
indexSegment = sb.indexSegments.segment(segmentName);
}
} else {
// take default segment
indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
if (post == null) {
// send back usage example
prop.put("mode", "0");
// getting the http host header
// get the http host header
final String hostSocket = header.get(HeaderFramework.CONNECTION_PROP_HOST);
//String host = hostSocket;
@ -80,7 +94,7 @@ public class QuickCrawlLink_p {
}
prop.put("mode", "1");
// getting the URL
// get the URL
String crawlingStart = post.get("url",null);
try {
crawlingStart = URLDecoder.decode(crawlingStart, "UTF-8");
@ -89,10 +103,10 @@ public class QuickCrawlLink_p {
e1.printStackTrace();
}
// getting the browser title
// get the browser title
final String title = post.get("title",null);
// getting other parameters if set
// get other parameters if set
final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_BAD_URL);
final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0"));
@ -123,7 +137,7 @@ public class QuickCrawlLink_p {
}
final String urlhash = crawlingStartURL.hash();
sb.indexSegment.urlMetadata().remove(urlhash);
indexSegment.urlMetadata().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);

@ -345,7 +345,7 @@ public class SettingsAck_p {
}
if (post.containsKey("seedSettings")) {
// getting the currently used uploading method
// get the currently used uploading method
final String oldSeedUploadMethod = env.getConfig("seedUploadMethod","none");
final String newSeedUploadMethod = post.get("seedUploadMethod");
final String oldSeedURLStr = sb.peers.mySeed().get(yacySeed.SEEDLIST, "");
@ -385,7 +385,7 @@ public class SettingsAck_p {
final HashMap<String, String> uploaders = yacyCore.getSeedUploadMethods();
final Iterator<String> uploaderKeys = uploaders.keySet().iterator();
while (uploaderKeys.hasNext()) {
// getting the uploader module name
// get the uploader module name
final String uploaderName = uploaderKeys.next();
@ -452,7 +452,7 @@ public class SettingsAck_p {
// Crawler settings
if (post.containsKey("crawlerSettings")) {
// getting Crawler Timeout
// get Crawler Timeout
String timeoutStr = post.get("crawler.clientTimeout");
if (timeoutStr==null||timeoutStr.length()==0) timeoutStr = "10000";
@ -467,7 +467,7 @@ public class SettingsAck_p {
return prop;
}
// getting maximum http file size
// get maximum http file size
String maxSizeStr = post.get("crawler.http.maxFileSize");
if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1";
@ -484,7 +484,7 @@ public class SettingsAck_p {
return prop;
}
// getting maximum ftp file size
// get maximum ftp file size
maxSizeStr = post.get("crawler.ftp.maxFileSize");
if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1";

@ -45,6 +45,8 @@ import de.anomic.http.client.Client;
import de.anomic.http.client.Cache;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseHeader;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.search.Switchboard;
@ -72,6 +74,18 @@ public class ViewFile {
final int display = (post == null) ? 0 : post.getInt("display", 0);
// get segment
Segment indexSegment = null;
if (post != null && post.containsKey("segment")) {
String segmentName = post.get("segment");
if (sb.indexSegments.segmentExist(segmentName)) {
indexSegment = sb.indexSegments.segment(segmentName);
}
} else {
// take default segment
indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
prop.put("display", display);
prop.put("error_display", display);
@ -90,12 +104,12 @@ public class ViewFile {
int size = 0;
boolean pre = false;
// getting the url hash from which the content should be loaded
// get the url hash from which the content should be loaded
final String urlHash = post.get("urlHash","");
if (urlHash.length() > 0) {
// getting the urlEntry that belongs to the url hash
// get the urlEntry that belongs to the url hash
URLMetadataRow urlEntry = null;
urlEntry = sb.indexSegment.urlMetadata().load(urlHash, null, 0);
urlEntry = indexSegment.urlMetadata().load(urlHash, null, 0);
if (urlEntry == null) {
prop.put("error", "2");
prop.put("viewMode",VIEW_MODE_NO_TEXT);

@ -45,6 +45,8 @@ import de.anomic.data.listManager;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.TransformerWriter;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
@ -86,6 +88,18 @@ public class WatchCrawler_p {
prop.put("list-remote", 0);
prop.put("forwardToCrawlStart", "0");
// get segment
Segment indexSegment = null;
if (post != null && post.containsKey("segment")) {
String segmentName = post.get("segment");
if (sb.indexSegments.segmentExist(segmentName)) {
indexSegment = sb.indexSegments.segment(segmentName);
}
}
// fall back to the default segment if no valid segment was requested
if (indexSegment == null) indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
prop.put("info", "0");
if (post != null) {
// a crawl start
@ -216,7 +230,7 @@ public class WatchCrawler_p {
// first delete old entry, if exists
final yacyURL url = new yacyURL(crawlingStart, null);
final String urlhash = url.hash();
sb.indexSegment.urlMetadata().remove(urlhash);
indexSegment.urlMetadata().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);

@ -93,7 +93,7 @@ public class WebStructurePicture_p {
try {
hash = (new yacyURL("http://" + host, null)).hash().substring(6);
} catch (final MalformedURLException e) {e.printStackTrace();}
assert (sb.webStructure.outgoingReferences(hash) != null);
//assert (sb.webStructure.outgoingReferences(hash) != null);
// recursively find domains, up to a specific depth
final ymageGraph graph = new ymageGraph();

@ -6,6 +6,8 @@ import java.util.Locale;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
@ -28,16 +30,22 @@ public class queues_p {
final Switchboard sb = (Switchboard) env;
//wikiCode wikiTransformer = new wikiCode(switchboard);
final serverObjects prop = new serverObjects();
if (post == null || !post.containsKey("html"))
Segment segment = null;
if (post == null || !post.containsKey("html")) {
prop.setLocalized(false);
if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header, false)) {
segment = sb.indexSegments.segment(post.get("segment"));
}
}
if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
prop.put("rejected", "0");
//int showRejectedCount = 10;
yacySeed initiator;
// index size
prop.putNum("urlpublictextSize", sb.indexSegment.urlMetadata().size());
prop.putNum("rwipublictextSize", sb.indexSegment.termIndex().sizesMax());
prop.putNum("urlpublictextSize", segment.urlMetadata().size());
prop.putNum("rwipublictextSize", segment.termIndex().sizesMax());
// loader queue
prop.put("loaderSize", Integer.toString(sb.crawlQueues.size()));

@ -3,6 +3,8 @@
import de.anomic.http.io.ByteCountInputStream;
import de.anomic.http.io.ByteCountOutputStream;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
@ -17,14 +19,21 @@ public class status_p {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
if (post == null || !post.containsKey("html"))
Segment segment = null;
if (post == null || !post.containsKey("html")) {
prop.setLocalized(false);
if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header, false)) {
segment = sb.indexSegments.segment(post.get("segment"));
}
}
if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
prop.put("rejected", "0");
sb.updateMySeed();
final int cacheMaxSize = (int) sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 10000);
prop.putNum("ppm", sb.currentPPM());
prop.putNum("qpm", sb.peers.mySeed().getQPM());
prop.put("wordCacheSize", Integer.toString(sb.indexSegment.termIndex().getBufferSize()));
prop.put("wordCacheSize", Integer.toString(segment.termIndex().getBufferSize()));
prop.put("wordCacheMaxSize", Integer.toString(cacheMaxSize));
//
// memory usage and system attributes

@ -32,6 +32,7 @@ import de.anomic.document.Word;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.TermSearch;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.util.DateFormatter;
@ -52,6 +53,13 @@ public final class timeline {
if ((post == null) || (env == null)) return prop;
final boolean authenticated = sb.adminAuthenticated(header) >= 2;
Segment segment = null;
if (post.containsKey("segment") && authenticated) {
segment = sb.indexSegments.segment(post.get("segment"));
} else {
segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
final String querystring = post.get("query", ""); // a string of word hashes that shall be searched and combined
final int count = Math.min((authenticated) ? 1000 : 10, post.getInt("maximumRecords", 1000)); // SRU syntax
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
@ -80,7 +88,7 @@ public final class timeline {
//yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links");
// get the index container with the result vector
final TermSearch<WordReference> search = sb.indexSegment.termIndex().query(
final TermSearch<WordReference> search = segment.termIndex().query(
q,
Word.words2hashes(query[1]),
null,

@ -28,6 +28,8 @@
import java.net.MalformedURLException;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
@ -42,6 +44,15 @@ public class yacydoc {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
Segment segment = null;
if (post == null || !post.containsKey("html")) {
if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header, false)) {
segment = sb.indexSegments.segment(post.get("segment"));
}
}
if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
prop.put("dc_title", "");
prop.put("dc_creator", "");
prop.put("dc_description", "");
@ -68,14 +79,14 @@ public class yacydoc {
}
if (urlhash == null || urlhash.length() == 0) return prop;
final URLMetadataRow entry = sb.indexSegment.urlMetadata().load(urlhash, null, 0);
final URLMetadataRow entry = segment.urlMetadata().load(urlhash, null, 0);
if (entry == null) return prop;
final URLMetadataRow.Components metadata = entry.metadata();
if (metadata.url() == null) {
return prop;
}
final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : sb.indexSegment.urlMetadata().load(entry.referrerHash(), null, 0);
final URLMetadataRow le = ((entry.referrerHash() == null) || (entry.referrerHash().length() != yacySeedDB.commonHashLength)) ? null : segment.urlMetadata().load(entry.referrerHash(), null, 0);
prop.putXML("dc_title", metadata.dc_title());
prop.putXML("dc_creator", metadata.dc_creator());

@ -50,7 +50,7 @@ public class autoconfig {
final boolean yacyonly = env.getConfigBool(SwitchboardConstants.PROXY_YACY_ONLY, false);
// getting the http host header
// get the http host header
final String hostSocket = header.get(HeaderFramework.CONNECTION_PROP_HOST);
String host = hostSocket;

@ -73,7 +73,7 @@ public class sharedBlacklist_p {
// return variable that accumulates replacements
final serverObjects prop = new serverObjects();
// getting the name of the destination blacklist
// get the name of the destination blacklist
String selectedBlacklistName = "";
if( post != null && post.containsKey("currentBlacklist") ){
selectedBlacklistName = post.get("currentBlacklist");
@ -113,7 +113,7 @@ public class sharedBlacklist_p {
* Import blacklist from other peer
* ====================================================== */
// getting the source peer hash
// get the source peer hash
final String Hash = post.get("hash");
// generate the download URL

@ -31,6 +31,7 @@ import java.io.IOException;
import de.anomic.crawler.ZURL;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
@ -136,7 +137,7 @@ public final class crawlReceipt {
if (result.equals("fill")) try {
// put new entry into database
sb.indexSegment.urlMetadata().store(entry);
sb.indexSegments.urlMetadata(Segments.Process.RECEIPTS).store(entry);
sb.crawlResults.stack(entry, youare, iam, 1);
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + entry.hash() + ":" + metadata.url().toNormalform(false, true));

@ -167,10 +167,10 @@ public final class message {
try {
if (!Boolean.valueOf(sb.getConfig("msgForwardingEnabled","false")).booleanValue()) return;
// getting the recipient address
// get the recipient address
final String sendMailTo = sb.getConfig("msgForwardingTo","root@localhost").trim();
// getting the sendmail configuration
// get the sendmail configuration
final String sendMailStr = sb.getConfig("msgForwardingCmd","/usr/bin/sendmail")+" "+sendMailTo;
final String[] sendMail = sendMailStr.trim().split(" ");

@ -31,6 +31,7 @@ import java.io.IOException;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
@ -85,7 +86,7 @@ public final class query {
if (obj.equals("rwiurlcount")) try {
// the total number of different urls in the rwi is returned
// <env> shall contain a word hash; the number of lurls assigned to this hash is returned
prop.put("response", sb.indexSegment.termIndex().get(env.getBytes(), null).size());
prop.put("response", sb.indexSegments.termIndex(Segments.Process.PUBLIC).get(env.getBytes(), null).size());
return prop;
} catch (IOException e) {
e.printStackTrace();
@ -93,13 +94,13 @@ public final class query {
if (obj.equals("rwicount")) {
// return the total number of available word indexes
prop.put("response", sb.indexSegment.termIndex().sizesMax());
prop.put("response", sb.indexSegments.termIndex(Segments.Process.PUBLIC).sizesMax());
return prop;
}
if (obj.equals("lurlcount")) {
// return the number of all available l-url's
prop.put("response", sb.indexSegment.urlMetadata().size());
prop.put("response", sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).size());
return prop;
}

@ -41,6 +41,7 @@ import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.util.SortStack;
import de.anomic.net.natLib;
@ -216,7 +217,7 @@ public final class search {
final long timer = System.currentTimeMillis();
//final Map<byte[], ReferenceContainer<WordReference>>[] containers = sb.indexSegment.index().searchTerm(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2StringSet(urls));
final HashMap<byte[], ReferenceContainer<WordReference>> incc = sb.indexSegment.termIndex().searchConjunction(theQuery.queryHashes, QueryParams.hashes2StringSet(urls));
final HashMap<byte[], ReferenceContainer<WordReference>> incc = sb.indexSegments.termIndex(Segments.Process.PUBLIC).searchConjunction(theQuery.queryHashes, QueryParams.hashes2StringSet(urls));
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(theQuery.id(true), SearchEvent.COLLECTION, incc.size(), System.currentTimeMillis() - timer), false);
if (incc != null) {
@ -269,7 +270,7 @@ public final class search {
RSSFeed.channels(RSSFeed.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.queryHashes), ""));
// make event
theSearch = SearchEventCache.getEvent(theQuery, sb.indexSegment, sb.peers, sb.crawlResults, null, true);
theSearch = SearchEventCache.getEvent(theQuery, sb.indexSegments.segment(Segments.Process.PUBLIC), sb.peers, sb.crawlResults, null, true);
// set statistic details of search result and find best result index set
if (theSearch.getRankingResult().getLocalResourceSize() == 0) {

@ -36,6 +36,7 @@ import de.anomic.content.RSSMessage;
import de.anomic.data.Blacklist;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.search.Switchboard;
@ -111,9 +112,9 @@ public final class transferRWI {
sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". Not granted. This peer is in robinson mode");
result = "not_granted";
pause = 60000;
} else if (sb.indexSegment.termIndex().getBufferSize() > cachelimit) {
} else if (sb.indexSegments.termIndex(Segments.Process.DHTIN).getBufferSize() > cachelimit) {
// we are too busy to receive indexes
sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.indexSegment.termIndex().getBufferSize() + ").");
sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.indexSegments.termIndex(Segments.Process.DHTIN).getBufferSize() + ").");
granted = false; // don't accept more words if there are too many words to flush
result = "busy";
pause = 60000;
@ -180,7 +181,7 @@ public final class transferRWI {
// learn entry
try {
sb.indexSegment.termIndex().add(wordHash.getBytes(), iEntry);
sb.indexSegments.termIndex(Segments.Process.DHTIN).add(wordHash.getBytes(), iEntry);
} catch (IOException e) {
e.printStackTrace();
}
@ -188,7 +189,7 @@ public final class transferRWI {
// check if we need to ask for the corresponding URL
if (!(knownURL.contains(urlHash)||unknownURL.contains(urlHash))) try {
if (sb.indexSegment.urlMetadata().exists(urlHash)) {
if (sb.indexSegments.urlMetadata(Segments.Process.DHTIN).exists(urlHash)) {
knownURL.add(urlHash);
} else {
unknownURL.add(urlHash);
@ -221,7 +222,7 @@ public final class transferRWI {
}
result = "ok";
pause = (int) (sb.indexSegment.termIndex().getBufferSize() * 20000 / sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
pause = (int) (sb.indexSegments.termIndex(Segments.Process.DHTIN).getBufferSize() * 20000 / sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
}
prop.put("unknownURL", unknownURLs.toString());

@ -33,6 +33,7 @@ import de.anomic.content.RSSMessage;
import de.anomic.data.Blacklist;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.search.Switchboard;
@ -44,7 +45,6 @@ import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacySeed;
public final class transferURL {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) throws InterruptedException {
final long start = System.currentTimeMillis();
@ -81,7 +81,7 @@ public final class transferURL {
} else {
int received = 0;
int blocked = 0;
final int sizeBefore = sb.indexSegment.urlMetadata().size();
final int sizeBefore = sb.indexSegments.urlMetadata(Segments.Process.DHTIN).size();
// read the urls from the other properties and store
String urls;
URLMetadataRow lEntry;
@ -139,7 +139,7 @@ public final class transferURL {
// write entry to database
yacyCore.log.logInfo("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false));
try {
sb.indexSegment.urlMetadata().store(lEntry);
sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry);
sb.crawlResults.stack(lEntry, iam, iam, 3);
if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName);
received++;
@ -151,7 +151,7 @@ public final class transferURL {
sb.peers.mySeed().incRU(received);
// return rewrite properties
final int more = sb.indexSegment.urlMetadata().size() - sizeBefore;
final int more = sb.indexSegments.urlMetadata(Segments.Process.DHTIN).size() - sizeBefore;
doublevalues = Integer.toString(received - more);
yacyCore.log.logInfo("Received " + received + " URLs from peer " + otherPeerName + " in " + (System.currentTimeMillis() - start) + " ms, blocked " + blocked + " URLs");
RSSFeed.channels(RSSFeed.INDEXRECEIVE).addMessage(new RSSMessage("Received " + received + " URLs from peer " + otherPeerName + ", blocked " + blocked, "", ""));

@ -30,6 +30,7 @@ import java.util.Date;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.search.Switchboard;
@ -75,7 +76,7 @@ public class urls {
if (entry == null) break;
// find referrer, if there is one
referrer = sb.getURL(entry.referrerhash());
referrer = sb.getURL(Segments.Process.PUBLIC, entry.referrerhash());
// place url to notice-url db
sb.crawlQueues.delegatedURL.push(
@ -112,10 +113,10 @@ public class urls {
URLMetadataRow.Components metadata;
yacyURL referrer;
for (int i = 0; i < count; i++) {
entry = sb.indexSegment.urlMetadata().load(urlhashes.substring(12 * i, 12 * (i + 1)), null, 0);
entry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(urlhashes.substring(12 * i, 12 * (i + 1)), null, 0);
if (entry == null) continue;
// find referrer, if there is one
referrer = sb.getURL(entry.referrerHash());
referrer = sb.getURL(Segments.Process.PUBLIC, entry.referrerHash());
// create RSS entry
metadata = entry.metadata();
prop.put("item_" + c + "_title", metadata.dc_title());

@ -45,6 +45,8 @@ import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.SetTools;
@ -96,11 +98,23 @@ public class yacysearch {
boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true"));
final serverObjects prop = new serverObjects();
// get segment
Segment indexSegment = null;
if (post != null && post.containsKey("segment")) {
String segmentName = post.get("segment");
if (sb.indexSegments.segmentExist(segmentName)) {
indexSegment = sb.indexSegments.segment(segmentName);
}
} else {
// take default segment
indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
//final boolean rss = (post == null) ? false : post.get("rss", "false").equals("true");
prop.put("promoteSearchPageGreeting", promoteSearchPageGreeting);
prop.put("promoteSearchPageGreeting.homepage", sb.getConfig(SwitchboardConstants.GREETING_HOMEPAGE, ""));
prop.put("promoteSearchPageGreeting.smallImage", sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, ""));
if ((post == null) || (env == null) || (!searchAllowed)) {
if (post == null || indexSegment == null || env == null || !searchAllowed) {
// we create empty entries for template strings
prop.put("searchagain", "0");
prop.put("display", display);
@ -235,7 +249,7 @@ public class yacysearch {
// check available memory and clean up if necessary
if (!MemoryControl.request(8000000L, false)) {
sb.indexSegment.urlMetadata().clearCache();
indexSegment.urlMetadata().clearCache();
SearchEventCache.cleanupEvents(true);
}
@ -374,7 +388,7 @@ public class yacysearch {
// delete the index entry locally
final String delHash = post.get("deleteref", ""); // urlhash
sb.indexSegment.termIndex().remove(Word.words2hashes(query[0]), delHash);
indexSegment.termIndex().remove(Word.words2hashes(query[0]), delHash);
// make new news message with negative voting
final HashMap<String, String> map = new HashMap<String, String>();
@ -393,7 +407,7 @@ public class yacysearch {
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
final URLMetadataRow urlentry = sb.indexSegment.urlMetadata().load(recommendHash, null, 0);
final URLMetadataRow urlentry = indexSegment.urlMetadata().load(recommendHash, null, 0);
if (urlentry != null) {
final URLMetadataRow.Components metadata = urlentry.metadata();
Document document;
@ -462,7 +476,7 @@ public class yacysearch {
theQuery.setOffset(0); // in case this is a new search, always start without an offset
offset = 0;
}
final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.indexSegment, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false);
final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, indexSegment, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false);
// generate result object
//serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");
@ -490,7 +504,7 @@ public class yacysearch {
}
prop.put("meanCount", meanMax);
if (meanMax > 0) {
DidYouMean didYouMean = new DidYouMean(sb.indexSegment.termIndex());
DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex());
Iterator<String> meanIt = didYouMean.getSuggestions(querystring, 300, 10).iterator();
int meanCount = 0;
String suggestion;

@ -71,6 +71,59 @@ public class PMHReader {
} catch (InterruptedException e) {}
}
public static StringBuilder escape(final String s) {
final int len = s.length();
final StringBuilder sbuf = new StringBuilder(len + 10);
for (int i = 0; i < len; i++) {
final int ch = s.charAt(i);
if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
sbuf.append((char)ch);
} else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
sbuf.append((char)ch);
} else if ('0' <= ch && ch <= '9') { // '0'..'9'
sbuf.append((char)ch);
} else if (ch == ' ') { // space
sbuf.append("%20");
} else if (ch == '&' || ch == ':' // punctuation passed through unchanged
|| ch == '-' || ch == '_'
|| ch == '.' || ch == '!'
|| ch == '~' || ch == '*'
|| ch == '\'' || ch == '('
|| ch == ')' || ch == ';') {
sbuf.append((char)ch);
} // any other character is silently dropped
}
return sbuf;
}
public static String unescape(final String s) {
final int l = s.length();
final StringBuilder sbuf = new StringBuilder(l);
int ch = -1;
int b, sumb = 0;
for (int i = 0, more = -1; i < l; i++) {
/* Get next byte b from URL segment s */
switch (ch = s.charAt(i)) {
case '%':
if (i + 2 < l) {
ch = s.charAt(++i);
int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
ch = s.charAt(++i);
int lb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF;
b = (hb << 4) | lb;
} else {
b = ch;
}
break;
case '+':
b = ' ';
break;
default:
b = ch;
}
/* Decode byte b as UTF-8; sumb collects the bits of multi-byte characters */
if ((b & 0xc0) == 0x80) {                   // 10xxxxxx: continuation byte
sumb = (sumb << 6) | (b & 0x3f);            // add 6 bits to sumb
if (--more == 0) sbuf.append((char) sumb);  // sequence complete
} else if ((b & 0x80) == 0x00) {            // 0xxxxxxx: plain ASCII
sbuf.append((char) b);
} else if ((b & 0xe0) == 0xc0) {            // 110xxxxx: start of 2-byte sequence
sumb = b & 0x1f;
more = 1;
} else if ((b & 0xf0) == 0xe0) {            // 1110xxxx: start of 3-byte sequence
sumb = b & 0x0f;
more = 2;
} else {                                    // 11110xxx: start of 4-byte sequence
sumb = b & 0x07;
more = 3;
}
}
return sbuf.toString();
}
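A short round-trip sketch for the two helpers above; the input is restricted to characters that escape() keeps, since everything else is dropped rather than percent-encoded:

    // illustrative only: round trip through escape() and unescape()
    public static void escapeDemo() {
        final String enc = escape("verb:ListRecords (test)").toString();
        // enc is "verb:ListRecords%20(test)"
        final String dec = unescape(enc);
        // dec is "verb:ListRecords (test)" again
        assert dec.equals("verb:ListRecords (test)");
    }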
public static void main(String[] args) {
// get one server with
// http://roar.eprints.org/index.php?action=csv

@ -0,0 +1,209 @@
//RobotsEntry.java
//-------------------------------------
//part of YACY
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//
//This file is contributed by Martin Thelian
// [MC] moved some methods from the robotsParser file (created by Alexander Schier) to this class
// [MC] redesign: moved the entry object out of the RobotsTxt class into this separate class
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
public class RobotsEntry {
public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
public static final String ALLOW_PATH_LIST = "allow";
public static final String DISALLOW_PATH_LIST = "disallow";
public static final String LOADED_DATE = "date";
public static final String MOD_DATE = "modDate";
public static final String ETAG = "etag";
public static final String SITEMAP = "sitemap";
public static final String CRAWL_DELAY = "crawlDelay";
public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
// this is a simple record structure that holds all properties of a single robots.txt entry
Map<String, String> mem;
private LinkedList<String> allowPathList, denyPathList;
String hostName;
public RobotsEntry(final String hostName, final Map<String, String> mem) {
this.hostName = hostName.toLowerCase();
this.mem = mem;
if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
this.denyPathList = new LinkedList<String>();
final String csPl = this.mem.get(DISALLOW_PATH_LIST);
if (csPl.length() > 0){
final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.denyPathList.addAll(Arrays.asList(pathArray));
}
}
} else {
this.denyPathList = new LinkedList<String>();
}
if (this.mem.containsKey(ALLOW_PATH_LIST)) {
this.allowPathList = new LinkedList<String>();
final String csPl = this.mem.get(ALLOW_PATH_LIST);
if (csPl.length() > 0){
final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.allowPathList.addAll(Arrays.asList(pathArray));
}
}
} else {
this.allowPathList = new LinkedList<String>();
}
}
public RobotsEntry(
final String hostName,
final ArrayList<String> allowPathList,
final ArrayList<String> disallowPathList,
final Date loadedDate,
final Date modDate,
final String eTag,
final String sitemap,
final long crawlDelayMillis
) {
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");
this.hostName = hostName.trim().toLowerCase();
this.allowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>();
this.mem = new HashMap<String, String>(5);
if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime()));
if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
if (eTag != null) this.mem.put(ETAG,eTag);
if (sitemap != null) this.mem.put(SITEMAP,sitemap);
if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis));
if ((allowPathList != null)&&(allowPathList.size()>0)) {
this.allowPathList.addAll(allowPathList);
final StringBuilder pathListStr = new StringBuilder();
for (int i=0; i<allowPathList.size();i++) {
pathListStr.append(allowPathList.get(i))
.append(ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(ALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
}
if ((disallowPathList != null)&&(disallowPathList.size()>0)) {
this.denyPathList.addAll(disallowPathList);
final StringBuilder pathListStr = new StringBuilder();
for (int i=0; i<disallowPathList.size();i++) {
pathListStr.append(disallowPathList.get(i))
.append(ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
}
}
public String toString() {
final StringBuilder str = new StringBuilder();
str.append((this.hostName==null)?"null":this.hostName)
.append(": ");
if (this.mem != null) {
str.append(this.mem.toString());
}
return str.toString();
}
public String getSitemap() {
return this.mem.containsKey(SITEMAP)? this.mem.get(SITEMAP): null;
}
public Date getLoadedDate() {
if (this.mem.containsKey(LOADED_DATE)) {
return new Date(Long.valueOf(this.mem.get(LOADED_DATE)).longValue());
}
return null;
}
public void setLoadedDate(final Date newLoadedDate) {
if (newLoadedDate != null) {
this.mem.put(LOADED_DATE,Long.toString(newLoadedDate.getTime()));
}
}
public Date getModDate() {
if (this.mem.containsKey(MOD_DATE)) {
return new Date(Long.valueOf(this.mem.get(MOD_DATE)).longValue());
}
return null;
}
public String getETag() {
if (this.mem.containsKey(ETAG)) {
return this.mem.get(ETAG);
}
return null;
}
public long getCrawlDelayMillis() {
if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
return Long.parseLong(this.mem.get(CRAWL_DELAY_MILLIS));
} catch (final NumberFormatException e) {
return 0;
}
if (this.mem.containsKey(CRAWL_DELAY)) try {
return 1000 * Integer.parseInt(this.mem.get(CRAWL_DELAY));
} catch (final NumberFormatException e) {
return 0;
}
return 0;
}
public boolean isDisallowed(String path) {
if ((this.mem == null) || (this.denyPathList.size() == 0)) return false;
// if the path is null or empty we set it to /
if ((path == null) || (path.length() == 0)) path = "/";
// escape all occurrences of ; because this char is used as a special char in the Robots DB
else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B");
final Iterator<String> pathIter = this.denyPathList.iterator();
while (pathIter.hasNext()) {
final String nextPath = pathIter.next();
// disallow rule
if (path.startsWith(nextPath)) {
return true;
}
}
return false;
}
}
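A minimal usage sketch for the relocated class, with hypothetical host, path, and delay values:

    // hypothetical usage: one deny rule and a crawl delay of 500 ms
    final ArrayList<String> deny = new ArrayList<String>();
    deny.add("/private/");
    final RobotsEntry entry = new RobotsEntry("example.org:80",
            new ArrayList<String>(), deny, new Date(), null, null, null, 500);
    assert entry.isDisallowed("/private/secret.html"); // matched by the deny rule
    assert !entry.isDisallowed("/index.html");         // no rule matches
    assert entry.getCrawlDelayMillis() == 500;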

@ -32,11 +32,7 @@ import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
@ -109,12 +105,12 @@ public class RobotsTxt {
return this.robotsTable.size();
}
private Entry getEntry(final String urlHostPort, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
private RobotsEntry getEntry(final String urlHostPort, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
// this method will always return a non-null value
Entry robotsTxt4Host = null;
RobotsEntry robotsTxt4Host = null;
try {
final Map<String, String> record = this.robotsTable.get(urlHostPort);
if (record != null) robotsTxt4Host = new Entry(urlHostPort, record);
if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
} catch (final kelondroException e) {
resetDatabase();
} catch (final IOException e) {
@ -143,7 +139,7 @@ public class RobotsTxt {
// to complete a download
try {
final Map<String, String> record = this.robotsTable.get(urlHostPort);
if (record != null) robotsTxt4Host = new Entry(urlHostPort, record);
if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
} catch (final kelondroException e) {
resetDatabase();
} catch (final IOException e) {
@ -185,7 +181,7 @@ public class RobotsTxt {
// no robots.txt available, make an entry to prevent that the robots loading is done twice
if (robotsTxt4Host == null) {
// generate artificial entry
robotsTxt4Host = new Entry(
robotsTxt4Host = new RobotsEntry(
urlHostPort,
new ArrayList<String>(),
new ArrayList<String>(),
@ -233,11 +229,11 @@ public class RobotsTxt {
public long crawlDelayMillis(final yacyURL theURL) {
final String urlHostPort = getHostPort(theURL);
final RobotsTxt.Entry robotsEntry = getEntry(urlHostPort, true);
final RobotsEntry robotsEntry = getEntry(urlHostPort, true);
return robotsEntry.getCrawlDelayMillis();
}
private Entry addEntry(
private RobotsEntry addEntry(
final String hostName,
final ArrayList<String> allowPathList,
final ArrayList<String> denyPathList,
@ -247,14 +243,14 @@ public class RobotsTxt {
final String sitemap,
final long crawlDelayMillis
) {
final Entry entry = new Entry(
final RobotsEntry entry = new RobotsEntry(
hostName, allowPathList, denyPathList, loadedDate, modDate,
eTag, sitemap, crawlDelayMillis);
addEntry(entry);
return entry;
}
private String addEntry(final Entry entry) {
private String addEntry(final RobotsEntry entry) {
// writes a new page and returns key
try {
this.robotsTable.put(entry.hostName, entry.mem);
@ -264,176 +260,6 @@ public class RobotsTxt {
}
}
public static class Entry {
public static final String ALLOW_PATH_LIST = "allow";
public static final String DISALLOW_PATH_LIST = "disallow";
public static final String LOADED_DATE = "date";
public static final String MOD_DATE = "modDate";
public static final String ETAG = "etag";
public static final String SITEMAP = "sitemap";
public static final String CRAWL_DELAY = "crawlDelay";
public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
// this is a simple record structure that holds all properties of a single crawl start
Map<String, String> mem;
private LinkedList<String> allowPathList, denyPathList;
String hostName;
public Entry(final String hostName, final Map<String, String> mem) {
this.hostName = hostName.toLowerCase();
this.mem = mem;
if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
this.denyPathList = new LinkedList<String>();
final String csPl = this.mem.get(DISALLOW_PATH_LIST);
if (csPl.length() > 0){
final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.denyPathList.addAll(Arrays.asList(pathArray));
}
}
} else {
this.denyPathList = new LinkedList<String>();
}
if (this.mem.containsKey(ALLOW_PATH_LIST)) {
this.allowPathList = new LinkedList<String>();
final String csPl = this.mem.get(ALLOW_PATH_LIST);
if (csPl.length() > 0){
final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.allowPathList.addAll(Arrays.asList(pathArray));
}
}
} else {
this.allowPathList = new LinkedList<String>();
}
}
public Entry(
final String hostName,
final ArrayList<String> allowPathList,
final ArrayList<String> disallowPathList,
final Date loadedDate,
final Date modDate,
final String eTag,
final String sitemap,
final long crawlDelayMillis
) {
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");
this.hostName = hostName.trim().toLowerCase();
this.allowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>();
this.mem = new HashMap<String, String>(5);
if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime()));
if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
if (eTag != null) this.mem.put(ETAG,eTag);
if (sitemap != null) this.mem.put(SITEMAP,sitemap);
if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis));
if ((allowPathList != null)&&(allowPathList.size()>0)) {
this.allowPathList.addAll(allowPathList);
final StringBuilder pathListStr = new StringBuilder();
for (int i=0; i<allowPathList.size();i++) {
pathListStr.append(allowPathList.get(i))
.append(ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(ALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
}
if ((disallowPathList != null)&&(disallowPathList.size()>0)) {
this.denyPathList.addAll(disallowPathList);
final StringBuilder pathListStr = new StringBuilder();
for (int i=0; i<disallowPathList.size();i++) {
pathListStr.append(disallowPathList.get(i))
.append(ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
}
}
public String toString() {
final StringBuilder str = new StringBuilder();
str.append((this.hostName==null)?"null":this.hostName)
.append(": ");
if (this.mem != null) {
str.append(this.mem.toString());
}
return str.toString();
}
public String getSitemap() {
return this.mem.containsKey(SITEMAP)? this.mem.get(SITEMAP): null;
}
public Date getLoadedDate() {
if (this.mem.containsKey(LOADED_DATE)) {
return new Date(Long.valueOf(this.mem.get(LOADED_DATE)).longValue());
}
return null;
}
public void setLoadedDate(final Date newLoadedDate) {
if (newLoadedDate != null) {
this.mem.put(LOADED_DATE,Long.toString(newLoadedDate.getTime()));
}
}
public Date getModDate() {
if (this.mem.containsKey(MOD_DATE)) {
return new Date(Long.valueOf(this.mem.get(MOD_DATE)).longValue());
}
return null;
}
public String getETag() {
if (this.mem.containsKey(ETAG)) {
return this.mem.get(ETAG);
}
return null;
}
public long getCrawlDelayMillis() {
if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
return Long.parseLong(this.mem.get(CRAWL_DELAY_MILLIS));
} catch (final NumberFormatException e) {
return 0;
}
if (this.mem.containsKey(CRAWL_DELAY)) try {
return 1000 * Integer.parseInt(this.mem.get(CRAWL_DELAY));
} catch (final NumberFormatException e) {
return 0;
}
return 0;
}
public boolean isDisallowed(String path) {
if ((this.mem == null) || (this.denyPathList.size() == 0)) return false;
// if the path is null or empty we set it to /
if ((path == null) || (path.length() == 0)) path = "/";
// escaping all occurences of ; because this char is used as special char in the Robots DB
else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B");
final Iterator<String> pathIter = this.denyPathList.iterator();
while (pathIter.hasNext()) {
final String nextPath = pathIter.next();
// disallow rule
if (path.startsWith(nextPath)) {
return true;
}
}
return false;
}
}
// methods that had been in robotsParser.java:
public static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
@ -469,7 +295,7 @@ public class RobotsTxt {
// generating the hostname:port string needed to do a DB lookup
final String urlHostPort = getHostPort(theURL);
final RobotsTxt.Entry robotsTxt4Host = this.getEntry(urlHostPort, true);
final RobotsEntry robotsTxt4Host = this.getEntry(urlHostPort, true);
try {
final String sitemapUrlStr = robotsTxt4Host.getSitemap();
@ -485,7 +311,7 @@ public class RobotsTxt {
// generating the hostname:port string needed to do a DB lookup
final String urlHostPort = getHostPort(theURL);
final RobotsTxt.Entry robotsTxt4Host = getEntry(urlHostPort, true);
final RobotsEntry robotsTxt4Host = getEntry(urlHostPort, true);
try {
crawlDelay = robotsTxt4Host.getCrawlDelayMillis();
@ -499,12 +325,12 @@ public class RobotsTxt {
// generating the hostname:port string needed to do a DB lookup
final String urlHostPort = getHostPort(nexturl);
RobotsTxt.Entry robotsTxt4Host = null;
RobotsEntry robotsTxt4Host = null;
robotsTxt4Host = getEntry(urlHostPort, true);
return robotsTxt4Host.isDisallowed(nexturl.getFile());
}
private static Object[] downloadRobotsTxt(final yacyURL robotsURL, int redirectionCount, final RobotsTxt.Entry entry) throws Exception {
private static Object[] downloadRobotsTxt(final yacyURL robotsURL, int redirectionCount, final RobotsEntry entry) throws Exception {
if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};
redirectionCount--;

@ -37,6 +37,7 @@ import de.anomic.document.Parser;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseHeader;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.net.ftpc;
import de.anomic.search.Switchboard;
@ -108,7 +109,7 @@ public class FTPLoader {
if (file.length() == 0) {
// directory -> get list of files
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(request.referrerhash()).toNormalform(true, false));
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false));
byte[] dirList = generateDirlist(ftpClient, request, path);
@ -242,7 +243,7 @@ public class FTPLoader {
// create a cache entry
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(request.referrerhash()).toNormalform(true, false));
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false));
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mimeType);

@ -35,6 +35,7 @@ import de.anomic.http.client.Client;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseContainer;
import de.anomic.kelondro.text.Segments;
import de.anomic.search.Switchboard;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
@ -118,7 +119,7 @@ public final class HTTPLoader {
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent);
yacyURL refererURL = null;
if (request.referrerhash() != null) refererURL = sb.getURL(request.referrerhash());
if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
requestHeader.put(HeaderFramework.ACCEPT_CHARSET, sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
@ -196,7 +197,7 @@ public final class HTTPLoader {
final String urlhash = redirectionUrl.hash();
// check if the url was already indexed
final String dbname = sb.urlExists(urlhash);
final String dbname = sb.urlExists(Segments.Process.LOCALCRAWLING, urlhash);
if (dbname != null) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection to double content");
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);

@ -43,6 +43,7 @@ import de.anomic.http.client.Cache;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseHeader;
import de.anomic.kelondro.text.Segments;
import de.anomic.search.Switchboard;
import de.anomic.server.serverCore;
import de.anomic.yacy.yacyURL;
@ -167,7 +168,7 @@ public final class LoaderDispatcher {
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
yacyURL refererURL = null;
if (request.referrerhash() != null) refererURL = sb.getURL(request.referrerhash());
if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
Response response = new Response(
request,

@ -46,6 +46,7 @@ import de.anomic.http.io.ByteCountInputStream;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseContainer;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.search.Switchboard;
@ -258,10 +259,10 @@ public class SitemapParser extends DefaultHandler {
// check if the url is known and needs to be recrawled
if (this.lastMod != null) {
final String dbocc = this.sb.urlExists(nexturlhash);
final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
// the url was already loaded. we need to check the date
final URLMetadataRow oldEntry = this.sb.indexSegment.urlMetadata().load(nexturlhash, null, 0);
final URLMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash, null, 0);
if (oldEntry != null) {
final Date modDate = oldEntry.moddate();
// check if modDate is null

@ -411,8 +411,8 @@ public class URLAnalysis {
public static int diffurlcol(String metadataPath, String statisticFile, String diffFile) throws IOException {
System.out.println("INDEX DIFF URL-COL startup");
HandleMap idx = new HandleMap(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, 4, new File(statisticFile), 0);
MetadataRepository mr = new MetadataRepository(new File(metadataPath), false, false);
HandleMap idx = new HandleMap(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, 4, new File(statisticFile), 0);
MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, 0, 1000000);
System.out.println("INDEX DIFF URL-COL loaded dump, starting diff");
long start = System.currentTimeMillis();
@ -438,8 +438,8 @@ public class URLAnalysis {
public static void export(String metadataPath, int format, String export, String diffFile) throws IOException {
// format: 0=text, 1=html, 2=rss/xml
System.out.println("URL EXPORT startup");
MetadataRepository mr = new MetadataRepository(new File(metadataPath), false, false);
System.out.println("URL EXPORT startup");
MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
HandleSet hs = (diffFile == null) ? null : new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(diffFile), 0);
System.out.println("URL EXPORT loaded dump, starting export");
Export e = mr.export(new File(export), ".*", hs, format, false);
@ -452,8 +452,8 @@ public class URLAnalysis {
}
public static void delete(String metadataPath, String diffFile) throws IOException {
System.out.println("URL DELETE startup");
MetadataRepository mr = new MetadataRepository(new File(metadataPath), false, false);
System.out.println("URL DELETE startup");
MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
int mrSize = mr.size();
HandleSet hs = new HandleSet(URLMetadataRow.rowdef.primaryKeyLength, URLMetadataRow.rowdef.objectOrder, new File(diffFile), 0);
System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);

@ -70,6 +70,7 @@ import de.anomic.kelondro.blob.Heap;
import de.anomic.kelondro.blob.MapView;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.kelondroException;
import de.anomic.kelondro.util.FileUtils;
@ -255,7 +256,7 @@ public class bookmarksDB {
Pattern.compile(newcrawlingMustMatch);
String urlhash = crawlingStartURL.hash();
sb.indexSegment.urlMetadata().remove(urlhash);
sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);

@ -1,6 +1,5 @@
// this is a temporary 1-to-1 copy of the transferURL servlet
package de.anomic.http.server.servlets;
import java.io.IOException;
@ -10,6 +9,7 @@ import de.anomic.content.RSSMessage;
import de.anomic.data.Blacklist;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.search.Switchboard;
@ -20,32 +20,32 @@ import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacySeed;
public final class transferURL {
public final class transferURL {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) throws InterruptedException {
final long start = System.currentTimeMillis();
long freshdate = 0;
@ -81,7 +81,7 @@ public final class transferURL {
} else {
int received = 0;
int blocked = 0;
final int sizeBefore = sb.indexSegment.urlMetadata().size();
final int sizeBefore = sb.indexSegments.urlMetadata(Segments.Process.DHTIN).size();
// read the urls from the other properties and store
String urls;
URLMetadataRow lEntry;
@ -139,7 +139,7 @@ public final class transferURL {
// write entry to database
yacyCore.log.logInfo("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false));
try {
sb.indexSegment.urlMetadata().store(lEntry);
sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry);
sb.crawlResults.stack(lEntry, iam, iam, 3);
if (yacyCore.log.isFine()) yacyCore.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName);
received++;
@ -151,7 +151,7 @@ public final class transferURL {
sb.peers.mySeed().incRU(received);
// return rewrite properties
final int more = sb.indexSegment.urlMetadata().size() - sizeBefore;
final int more = sb.indexSegments.urlMetadata(Segments.Process.DHTIN).size() - sizeBefore;
doublevalues = Integer.toString(received - more);
yacyCore.log.logInfo("Received " + received + " URLs from peer " + otherPeerName + " in " + (System.currentTimeMillis() - start) + " ms, blocked " + blocked + " URLs");
RSSFeed.channels(RSSFeed.INDEXRECEIVE).addMessage(new RSSMessage("Received " + received + " URLs from peer " + otherPeerName + ", blocked " + blocked, "", ""));
@ -163,4 +163,4 @@ public final class transferURL {
prop.put("result", result);
return prop;
}
}
}

@ -450,6 +450,7 @@ public class ArrayStack implements BLOB {
* @return the number of entries in each blob
*/
public synchronized int[] sizes() {
if (blobs == null) return new int[0];
int[] s = new int[blobs.size()];
int c = 0;
for (blobItem bi: blobs) s[c++] = bi.blob.size();

@ -126,7 +126,7 @@ public class Compressor implements BLOB {
private byte[] decompress(byte[] b) {
// use a magic in the head of the bytes to identify compression type
if (b == null) return null;
if (ByteArray.equals(b, gzipMagic)) {
if (ByteArray.startsWith(b, gzipMagic)) {
//System.out.print("\\"); // DEBUG
cdr--;
ByteArrayInputStream bais = new ByteArrayInputStream(b);
@ -150,7 +150,7 @@ public class Compressor implements BLOB {
e.printStackTrace();
return null;
}
} else if (ByteArray.equals(b, plainMagic)) {
} else if (ByteArray.startsWith(b, plainMagic)) {
//System.out.print("-"); // DEBUG
byte[] r = new byte[b.length - 2];
System.arraycopy(b, 2, r, 0, b.length - 2);

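The equals-to-startsWith change is the actual fix in Compressor: the magic occupies only the first two bytes of a stored record (note the offset 2 in the arraycopy above), so a whole-array equality test could never match a record with a non-empty payload. A prefix check in the spirit of startsWith, sketched without any claim about ByteArray's real signature:

    // sketch of the prefix test that startsWith must perform
    static boolean hasMagic(final byte[] record, final byte[] magic) {
        if (record == null || record.length < magic.length) return false;
        for (int i = 0; i < magic.length; i++) {
            if (record[i] != magic[i]) return false;
        }
        return true;
    }
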
@ -173,7 +173,7 @@ public class Column {
else if (this.celltype == celltype_boolean) this.encoder = encoder_bytes;
else if (this.celltype == celltype_binary) this.encoder = encoder_bytes;
else if (this.celltype == celltype_string) this.encoder = encoder_bytes;
else throw new kelondroException("kelondroColumn - encoder missing for cell " + this.nickname);
else throw new kelondroException("kelondroColumn - encoder missing for cell '" + this.nickname + "'");
}
} else {
if (this.celltype == celltype_cardinal) throw new kelondroException("kelondroColumn - encoder missing for cell " + this.nickname);

@ -380,11 +380,11 @@ public final class Row {
final Object[] ref = nickref.get(nickname);
if (ref == null) return;
final Column col = (Column) ref[0];
setCol(col.encoder, ((Integer) ref[1]).intValue(), col.cellwidth, cell);
setCol(((Integer) ref[1]).intValue(), col.cellwidth, cell);
}
public final void setCol(final int column, final byte[] cell) {
setCol(row[column].encoder, colstart[column], row[column].cellwidth, cell);
setCol(colstart[column], row[column].cellwidth, cell);
}
public final void setCol(final int column, final char[] cell) {
@ -393,7 +393,7 @@ public final class Row {
for (int i = cell.length; i < row[column].cellwidth; i++) rowinstance[offset + clstrt + i] = 0;
}
private final void setCol(final int encoding, final int clstrt, int length, final byte[] cell) {
private final void setCol(final int clstrt, int length, final byte[] cell) {
if (cell == null) {
while (length-- > 0) rowinstance[offset + clstrt + length] = 0;
} else {
@ -411,6 +411,10 @@ public final class Row {
rowinstance[offset + colstart[column]] = c;
}
public final void setCol(final int column, final String cell) {
setCol(column, cell.getBytes());
}
public final void setCol(final int column, final String cell, final String encoding) {
if (encoding == null)
setCol(column, cell.getBytes());

@ -122,7 +122,7 @@ public class DocumentIndex extends Segment {
* If the given file is a path to a directory, the complete sub-tree is indexed
* @param start
*/
public void addAll(File start) {
public void addConcurrent(File start) {
assert (start != null);
assert (start.canRead()) : start.toString();
if (!start.isDirectory()) {
@ -137,7 +137,7 @@ public class DocumentIndex extends Segment {
w = new File(start, t);
if (w.canRead() && !w.isHidden()) {
if (w.isDirectory()) {
addAll(w);
addConcurrent(w);
} else {
try {
this.queue.put(w);
@ -232,7 +232,7 @@ public class DocumentIndex extends Segment {
if (args[1].equals("add")) {
File f = new File(args[2]);
DocumentIndex di = new DocumentIndex(segmentPath);
di.addAll(f);
di.addConcurrent(f);
di.close();
} else {
String query = "";

@ -31,6 +31,7 @@ import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import de.anomic.kelondro.index.ARC;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.SimpleARC;
import de.anomic.kelondro.order.ByteOrder;
@ -64,11 +65,12 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
private long lastCleanup;
private final long targetFileSize, maxFileSize;
private final int writeBufferSize;
private final SimpleARC<ByteArray, Integer> countCache;
private final ARC<ByteArray, Integer> countCache;
private boolean cleanerRunning = false;
public IndexCell(
final File cellPath,
final String prefix,
final ReferenceFactory<ReferenceType> factory,
final ByteOrder termOrder,
final Row payloadrow,
@ -80,7 +82,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
) throws IOException {
super(factory);
this.array = new ReferenceContainerArray<ReferenceType>(cellPath, factory, termOrder, payloadrow, merger);
this.array = new ReferenceContainerArray<ReferenceType>(cellPath, prefix, factory, termOrder, payloadrow, merger);
this.ram = new ReferenceContainerCache<ReferenceType>(factory, payloadrow, termOrder);
this.maxRamEntries = maxRamEntries;
this.merger = merger;

@ -57,6 +57,7 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
*/
public ReferenceContainerArray(
final File heapLocation,
final String prefix,
final ReferenceFactory<ReferenceType> factory,
final ByteOrder termOrder,
final Row payloadrow,
@ -65,7 +66,7 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
this.payloadrow = payloadrow;
this.array = new ArrayStack(
heapLocation,
"index",
prefix,
payloadrow.primaryKeyLength,
termOrder,
0);
@ -217,6 +218,18 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
return c;
}
/**
* calculate an upper limit for a ranking number derived from the container size.
* The returned number is not a counter; it can only be used to compare the
* ReferenceContainers that may be produced as a result of get()
* @param termHash
* @return a ranking number
* @throws IOException
*/
public long lenghtRankingUpperLimit(final byte[] termHash) throws IOException {
return this.array.lengthAdd(termHash);
}
/**
* delete an indexContainer from the heap cache. This can only be used for write-enabled heaps
* @param wordHash

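As the javadoc above stresses, lenghtRankingUpperLimit yields an ordering key, not a count; a comparison sketch, where ra and the two term hashes are hypothetical:

    // hypothetical comparison: the limits order term containers by expected
    // size, they are not entry counts
    final long limitA = ra.lenghtRankingUpperLimit(termHashA);
    final long limitB = ra.lenghtRankingUpperLimit(termHashB);
    if (limitA > limitB) {
        // the container for termHashA is expected to be the larger one
    }
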
@ -82,6 +82,9 @@ public class Segment {
final boolean useTailCache,
final boolean exceed134217727) throws IOException {
migrateTextIndex(segmentPath, segmentPath);
migrateTextMetadata(segmentPath, segmentPath);
log.logInfo("Initializing Segment '" + segmentPath + "', word hash cache size is " + Word.hashCacheSize + ".");
this.log = log;
@ -89,8 +92,10 @@ public class Segment {
this.merger = new IODispatcher(1, 1, writeBufferSize);
this.merger.start();
this.termIndex = new IndexCell<WordReference>(
new File(segmentPath, "RICELL"),
segmentPath,
"text.index",
wordReferenceFactory,
wordOrder,
WordReferenceRow.urlEntryRow,
@ -111,11 +116,35 @@ public class Segment {
this.merger,
writeBufferSize);
*/
File metadatadir = new File(segmentPath, "METADATA");
if (!metadatadir.exists()) metadatadir.mkdirs();
// create LURL-db
urlMetadata = new MetadataRepository(metadatadir, useTailCache, exceed134217727);
urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727);
}
public static void migrateTextIndex(File oldSegmentPath, File newSegmentPath) {
File oldCellPath = new File(oldSegmentPath, "RICELL");
if (!oldCellPath.exists()) return;
String[] oldIndexFiles = oldCellPath.list();
for (String oldIndexFile: oldIndexFiles) {
if (oldIndexFile.startsWith("index.")) {
File newFile = new File(newSegmentPath, "text.index." + oldIndexFile.substring(6));
new File(oldCellPath, oldIndexFile).renameTo(newFile);
}
}
oldCellPath.delete();
}
public static void migrateTextMetadata(File oldSegmentPath, File newSegmentPath) {
File oldMetadataPath = new File(oldSegmentPath, "METADATA");
if (!oldMetadataPath.exists()) return;
String[] oldMetadataFiles = oldMetadataPath.list();
for (String oldMetadataFile: oldMetadataFiles) {
if (oldMetadataFile.startsWith("urls.")) {
File newFile = new File(newSegmentPath, "text.urlmd." + oldMetadataFile.substring(5));
new File(oldMetadataPath, oldMetadataFile).renameTo(newFile);
}
}
oldMetadataPath.delete();
}
public MetadataRepository urlMetadata() {
@ -431,5 +460,13 @@ public class Segment {
}
}
}
public int rwisize() {
return termIndex().sizesMax();
}
public int urlsize() {
return urlMetadata().size();
}
}
}

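The two migration helpers above move the old RICELL/index.* and METADATA/urls.* files into the segment directory under the new prefixes. A standalone sketch (not YaCy code, file names hypothetical) of the rename mapping they perform:

public class MigrationNameDemo {
    public static void main(String[] args) {
        String[] before = { "index.20090101.blob", "urls.20090101.table" };
        for (String name : before) {
            // "index." has 6 characters, "urls." has 5, matching the substring offsets above
            if (name.startsWith("index."))
                System.out.println(name + " -> text.index." + name.substring(6));
            if (name.startsWith("urls."))
                System.out.println(name + " -> text.urlmd." + name.substring(5));
        }
        // prints:
        // index.20090101.blob -> text.index.20090101.blob
        // urls.20090101.table -> text.urlmd.20090101.table
    }
}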
@ -0,0 +1,226 @@
// Segments.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 30.07.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-05-28 01:51:34 +0200 (Do, 28 Mai 2009) $
// $LastChangedRevision: 5988 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.document.Condenser;
import de.anomic.document.Document;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
public final class Segments implements Iterable<Segment> {
/**
* process enumeration type
* defines constants that can be used to assign process-related segment names
*/
public enum Process {
RECEIPTS,
QUERIES,
DHTIN,
        DHTOUT, // the only segment that is used read-only
PROXY,
LOCALCRAWLING,
REMOTECRAWLING,
PUBLIC; // includes the index that can be retrieved by the yacy p2p api
public String toString() {
throw new UnsupportedOperationException("toString not allowed");
}
}
private final Log log;
private final File segmentsPath;
private final int entityCacheMaxSize;
private final long maxFileSize;
private HashMap<String, Segment> segments;
private HashMap<Process, String> process_assignment;
private final boolean useTailCache;
private final boolean exceed134217727;
public Segments(
final Log log,
final File segmentsPath,
final int entityCacheMaxSize,
final long maxFileSize,
final boolean useTailCache,
final boolean exceed134217727) throws IOException {
this.log = log;
this.segmentsPath = segmentsPath;
this.entityCacheMaxSize = entityCacheMaxSize;
this.maxFileSize = maxFileSize;
this.useTailCache = useTailCache;
this.exceed134217727 = exceed134217727;
this.segments = new HashMap<String, Segment>();
this.process_assignment = new HashMap<Process, String>();
// assign default segment names for the processes
this.process_assignment.put(Process.RECEIPTS, "default");
this.process_assignment.put(Process.QUERIES, "default");
this.process_assignment.put(Process.DHTIN, "default");
this.process_assignment.put(Process.DHTOUT, "default");
this.process_assignment.put(Process.PROXY, "default");
this.process_assignment.put(Process.LOCALCRAWLING, "default");
this.process_assignment.put(Process.REMOTECRAWLING, "default");
this.process_assignment.put(Process.PUBLIC, "default");
}
public void setSegment(Process process, String segmentName) {
this.process_assignment.put(process, segmentName);
}
public static void migrateOld(File oldSingleSegment, File newSegmentsPath, String newSegmentName) {
if (!oldSingleSegment.exists()) return;
File newSegmentPath = new File(newSegmentsPath, newSegmentName);
if (!newSegmentPath.exists()) newSegmentPath.mkdirs();
Segment.migrateTextIndex(oldSingleSegment, newSegmentPath);
Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath);
String[] oldFiles = oldSingleSegment.list();
for (String oldFile: oldFiles) {
if (oldFile.startsWith("text.")) {
new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile));
}
}
}
public String[] segmentNames() {
return this.segments.keySet().toArray(new String[this.segments.size()]);
}
public boolean segmentExist(final String segmentName) {
return segments.containsKey(segmentName);
}
public Segment segment(final Process process) {
return segment(this.process_assignment.get(process));
}
public Segment segment(final String segmentName) {
Segment segment = segments.get(segmentName);
if (segment == null) {
// generate the segment
try {
segment = new Segment(
this.log,
new File(this.segmentsPath, segmentName),
this.entityCacheMaxSize,
this.maxFileSize,
this.useTailCache,
this.exceed134217727);
} catch (IOException e) {
e.printStackTrace();
return null;
}
this.segments.put(segmentName, segment);
}
return segment;
}
public int URLCount() {
int c = 0;
for (Segment s: this.segments.values()) c += s.urlMetadata().size();
return c;
}
public int RWICount() {
int c = 0;
for (Segment s: this.segments.values()) c += s.termIndex().sizesMax();
return c;
}
public int RWIBufferCount() {
int c = 0;
for (Segment s: this.segments.values()) c += s.termIndex().getBufferSize();
return c;
}
public MetadataRepository urlMetadata(final Process process) {
return segment(this.process_assignment.get(process)).urlMetadata();
}
public IndexCell<WordReference> termIndex(final Process process) {
return segment(this.process_assignment.get(process)).termIndex();
}
public void clear(final Process process) {
segment(this.process_assignment.get(process)).clear();
}
public File getLocation(final Process process) {
return segment(this.process_assignment.get(process)).getLocation();
}
public void close(final Process process) {
segment(this.process_assignment.get(process)).close();
}
public void close() {
if (segments != null) for (Segment s: this.segments.values()) s.close();
this.segments = null;
}
public void finalize() {
this.close();
}
public URLMetadataRow storeDocument(
final String segmentName,
final yacyURL url,
final yacyURL referrerURL,
final Date docDate,
final long sourcesize,
final Document document,
final Condenser condenser
) throws IOException {
return segment(segmentName).storeDocument(
url,
referrerURL,
docDate,
sourcesize,
document,
condenser
);
}
public synchronized Segment.ReferenceCleaner getReferenceCleaner(final String segmentName, final byte[] startHash) throws IOException {
return segment(segmentName).getReferenceCleaner(startHash);
}
public Iterator<Segment> iterator() {
return this.segments.values().iterator();
}
}

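A hedged usage sketch of the new Segments facade defined above; the log name, path, and numeric values are placeholders, not values from this commit:

Segments segments = new Segments(
        new Log("SEGMENTS-DEMO"),                  // hypothetical log name
        new File("DATA/INDEX/freeworld/SEGMENTS"), // hypothetical segments path
        10000,        // entityCacheMaxSize (illustrative value)
        1000000000L,  // maxFileSize (illustrative value)
        false,        // useTailCache
        false);       // exceed134217727

// route incoming DHT writes to their own segment; all other processes stay on "default"
segments.setSegment(Segments.Process.DHTIN, "dhtin");

// the segment is created lazily on first access
Segment dhtInSegment = segments.segment(Segments.Process.DHTIN);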
@ -156,7 +156,7 @@ public class URLMetadataRow implements Metadata {
final int lapp) {
// create new entry
this.entry = rowdef.newEntry();
this.entry.setCol(col_hash, url.hash(), null);
this.entry.setCol(col_hash, url.hash());
this.entry.setCol(col_comp, encodeComp(url, dc_title, dc_creator, dc_subject, ETag));
encodeDate(col_mod, mod);
encodeDate(col_load, load);
@ -191,11 +191,17 @@ public class URLMetadataRow implements Metadata {
private void encodeDate(final int col, final Date d) {
        // calculates the number of days since 1.1.1970 and stores this as a 4-byte array
this.entry.setCol(col, NaturalOrder.encodeLong(d.getTime() / 86400000, 4));
// 86400000 is the number of milliseconds in one day
this.entry.setCol(col, NaturalOrder.encodeLong(d.getTime() / 86400000L, 4));
}
private Date decodeDate(final int col) {
return new Date(86400000 * this.entry.getColLong(col));
long t = this.entry.getColLong(col);
/*if (t < 14600) */return new Date(86400000L * t); // time was stored as number of days since epoch
/*
if (t < 350400) return new Date(3600000L * t); // hours since epoch
if (t < 21024000) return new Date(60000L * t); // minutes since epoch
*/
}
public static byte[] encodeComp(final yacyURL url, final String dc_title, final String dc_creator, final String dc_subject, final String ETag) {

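A standalone demo (not YaCy code) of the day-granularity round trip used by encodeDate/decodeDate above; note that the trip keeps the day but truncates the time of day:

import java.util.Date;

public class DayDateDemo {
    public static void main(String[] args) {
        Date now = new Date();
        long days = now.getTime() / 86400000L;     // days since epoch (1.1.1970)
        Date restored = new Date(86400000L * days); // back to milliseconds, at midnight UTC
        System.out.println(now + " -> " + days + " -> " + restored);
    }
}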
@ -60,7 +60,7 @@ public class ByteArray {
return buffer[pos];
}
public static boolean equals(final byte[] buffer, final byte[] pattern) {
public static boolean startsWith(final byte[] buffer, final byte[] pattern) {
    // compares two byte arrays: true if the pattern appears completely at the start of the buffer
if (buffer == null && pattern == null) return true;
if (buffer == null || pattern == null) return false;

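The rename from equals to startsWith matches the documented semantics: the buffer may be longer than the pattern. A standalone sketch consistent with the null handling shown above (the loop body is an assumption, not the YaCy implementation):

public static boolean startsWith(final byte[] buffer, final byte[] pattern) {
    if (buffer == null && pattern == null) return true;
    if (buffer == null || pattern == null) return false;
    if (buffer.length < pattern.length) return false;
    for (int i = 0; i < pattern.length; i++) {
        if (buffer[i] != pattern[i]) return false; // mismatch before the pattern ends
    }
    return true;
}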
@ -226,7 +226,7 @@ public class ResultFetcher {
registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
if (!peers.mySeed().isVirgin())
try {
TextSnippet.failConsequences(snippet, query.id(false));
TextSnippet.failConsequences(this.indexSegment, snippet, query.id(false));
} catch (IOException e) {
e.printStackTrace();
}

@ -160,6 +160,7 @@ import de.anomic.http.server.RobotsTxtConfig;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.Segments;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.util.DateFormatter;
@ -231,7 +232,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
public File surrogatesInPath;
public File surrogatesOutPath;
public Map<String, String> rankingPermissions;
public Segment indexSegment;
public Segments indexSegments;
public LoaderDispatcher loader;
public CrawlSwitchboard crawler;
public CrawlQueues crawlQueues;
@ -363,9 +364,12 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
partitionExponent,
this.useTailCache,
this.exceed134217727);
indexSegment = new Segment(
File oldSingleSegment = new File(new File(indexPath, networkName), "TEXT");
File newSegmentsPath = new File(new File(indexPath, networkName), "SEGMENTS");
Segments.migrateOld(oldSingleSegment, newSegmentsPath, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"));
indexSegments = new Segments(
log,
new File(new File(indexPath, networkName), "TEXT"),
newSegmentsPath,
wordCacheMaxCount,
fileSizeMax,
this.useTailCache,
@ -377,8 +381,20 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
this.queuesRoot);
} catch (IOException e1) {
e1.printStackTrace();
indexSegment = null;
indexSegments = null;
}
// set the default segment names
indexSegments.setSegment(Segments.Process.RECEIPTS, getConfig(SwitchboardConstants.SEGMENT_RECEIPTS, "default"));
indexSegments.setSegment(Segments.Process.QUERIES, getConfig(SwitchboardConstants.SEGMENT_QUERIES, "default"));
indexSegments.setSegment(Segments.Process.DHTIN, getConfig(SwitchboardConstants.SEGMENT_DHTIN, "default"));
indexSegments.setSegment(Segments.Process.DHTOUT, getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default"));
indexSegments.setSegment(Segments.Process.PROXY, getConfig(SwitchboardConstants.SEGMENT_PROXY, "default"));
indexSegments.setSegment(Segments.Process.LOCALCRAWLING, getConfig(SwitchboardConstants.SEGMENT_LOCALCRAWLING, "default"));
indexSegments.setSegment(Segments.Process.REMOTECRAWLING, getConfig(SwitchboardConstants.SEGMENT_REMOTECRAWLING, "default"));
indexSegments.setSegment(Segments.Process.PUBLIC, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"));
// init crawl results monitor cache
crawlResults = new ResultURLs();
// start yacy core
@ -389,8 +405,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
// init a DHT transmission dispatcher
this.dhtDispatcher = new Dispatcher(
indexSegment.termIndex(),
indexSegment.urlMetadata(),
indexSegments.segment(Segments.Process.LOCALCRAWLING),
peers,
true,
30000);
@ -583,7 +598,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
this.crawlStacker = new CrawlStacker(
this.crawlQueues,
this.crawler,
this.indexSegment,
this.indexSegments.segment(Segments.Process.LOCALCRAWLING),
this.peers,
"local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
"global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0);
@ -794,8 +809,8 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
// switch the networks
synchronized (this) {
// shut down
synchronized (this.indexSegment) {
this.indexSegment.close();
synchronized (this.indexSegments) {
this.indexSegments.close();
}
this.crawlStacker.announceClose();
this.crawlStacker.close();
@ -832,9 +847,9 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
this.useTailCache,
this.exceed134217727);
try {
indexSegment = new Segment(
indexSegments = new Segments(
log,
new File(new File(indexPrimaryPath, networkName), "TEXT"),
new File(new File(indexPrimaryPath, networkName), "SEGMENTS"),
wordCacheMaxCount,
fileSizeMax,
this.useTailCache,
@ -868,11 +883,10 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
this.getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
// we need a new stacker, because this uses network-specific attributes to sort out urls (local, global)
this.crawlStacker = new CrawlStacker(
this.crawlQueues,
this.crawler,
this.indexSegment,
this.indexSegments.segment(Segments.Process.LOCALCRAWLING),
this.peers,
"local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
"global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0);
@ -1007,26 +1021,32 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
}
}
public String urlExists(final String hash) {
public String urlExists(Segments.Process process, final String hash) {
        // tests if the hash occurs in any database;
        // if it exists, the name of the database is returned,
        // if it does not exist, null is returned
if (indexSegment.urlMetadata().exists(hash)) return "loaded";
if (indexSegments.urlMetadata(process).exists(hash)) return "loaded";
return this.crawlQueues.urlExists(hash);
}
public void urlRemove(final String hash) {
indexSegment.urlMetadata().remove(hash);
public void urlRemove(Segment segment, final String hash) {
segment.urlMetadata().remove(hash);
crawlResults.remove(hash);
crawlQueues.urlRemove(hash);
}
public yacyURL getURL(final String urlhash) {
public void urlRemove(Segments.Process process, final String hash) {
indexSegments.urlMetadata(process).remove(hash);
crawlResults.remove(hash);
crawlQueues.urlRemove(hash);
}
public yacyURL getURL(Segments.Process process, final String urlhash) {
if (urlhash == null) return null;
if (urlhash.length() == 0) return null;
final yacyURL ne = crawlQueues.getURL(urlhash);
if (ne != null) return ne;
final URLMetadataRow le = indexSegment.urlMetadata().load(urlhash, null, 0);
final URLMetadataRow le = indexSegments.urlMetadata(process).load(urlhash, null, 0);
if (le != null) return le.metadata().url();
return null;
}
@ -1120,7 +1140,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
crawlQueues.close();
crawler.close();
log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)");
indexSegment.close();
indexSegments.close();
peers.close();
Cache.close();
UPnP.deletePortMapping();
@ -1187,7 +1207,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
// put document into the concurrent processing queue
if (log.isFinest()) log.logFinest("deQueue: passing to indexing queue: " + response.url().toNormalform(true, false));
try {
this.indexingDocumentProcessor.enQueue(new indexingQueueEntry(response, null, null));
this.indexingDocumentProcessor.enQueue(new indexingQueueEntry(Segments.Process.LOCALCRAWLING, response, null, null));
return null;
} catch (InterruptedException e) {
e.printStackTrace();
@ -1232,7 +1252,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
0
);
response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
indexingQueueEntry queueEntry = new indexingQueueEntry(response, document, null);
indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.LOCALCRAWLING, response, document, null);
// place the queue entry into the concurrent process of the condenser (document analysis)
try {
@ -1300,14 +1320,17 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
}
public static class indexingQueueEntry extends serverProcessorJob {
public Segments.Process process;
public Response queueEntry;
public Document document;
public Condenser condenser;
public indexingQueueEntry(
final Segments.Process process,
final Response queueEntry,
final Document document,
final Condenser condenser) {
super();
this.process = process;
this.queueEntry = queueEntry;
this.document = document;
this.condenser = condenser;
@ -1330,7 +1353,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
// clear caches if necessary
if (!MemoryControl.request(8000000L, false)) {
indexSegment.urlMetadata().clearCache();
for (Segment indexSegment: this.indexSegments) indexSegment.urlMetadata().clearCache();
SearchEventCache.cleanupEvents(true);
}
@ -1569,7 +1592,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
if (document == null) {
return null;
}
return new indexingQueueEntry(in.queueEntry, document, null);
return new indexingQueueEntry(in.process, in.queueEntry, document, null);
}
private Document parseDocument(Response entry) throws InterruptedException {
@ -1679,7 +1702,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
final CrawlProfile.entry profile = in.queueEntry.profile();
ResultImages.registerImages(in.document, (profile == null) ? true : !profile.remoteIndexing());
return new indexingQueueEntry(in.queueEntry, in.document, condenser);
return new indexingQueueEntry(in.process, in.queueEntry, in.document, condenser);
} catch (final UnsupportedEncodingException e) {
return null;
}
@ -1693,11 +1716,11 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
public void storeDocumentIndex(final indexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_INDEXSTORAGE);
storeDocumentIndex(in.queueEntry, in.document, in.condenser);
storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser);
in.queueEntry.updateStatus(Response.QUEUE_STATE_FINISHED);
}
private void storeDocumentIndex(final Response queueEntry, final Document document, final Condenser condenser) {
private void storeDocumentIndex(Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser) {
// CREATE INDEX
final String dc_title = document.dc_title();
@ -1710,7 +1733,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
// STORE URL TO LOADED-URL-DB
URLMetadataRow newEntry = null;
try {
newEntry = indexSegment.storeDocument(
newEntry = indexSegments.segment(process).storeDocument(
queueEntry.url(),
referrerURL,
queueEntry.lastModified(),
@ -1726,10 +1749,10 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
// update url result list statistics
crawlResults.stack(
newEntry, // loaded url db entry
queueEntry.initiator(), // initiator peer hash
newEntry, // loaded url db entry
queueEntry.initiator(), // initiator peer hash
this.peers.mySeed().hash, // executor peer hash
processCase // process case
processCase // process case
);
// STORE WORD INDEX
@ -1801,11 +1824,11 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
}
// method for index deletion
public int removeAllUrlReferences(final yacyURL url, final boolean fetchOnline) {
return removeAllUrlReferences(url.hash(), fetchOnline);
public int removeAllUrlReferences(Segment indexSegment, final yacyURL url, final boolean fetchOnline) {
return removeAllUrlReferences(indexSegment, url.hash(), fetchOnline);
}
public int removeAllUrlReferences(final String urlhash, final boolean fetchOnline) {
public int removeAllUrlReferences(Segment indexSegment, final String urlhash, final boolean fetchOnline) {
// find all the words in a specific resource and remove the url reference from every word index
// finally, delete the url entry
@ -1937,7 +1960,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
return accessSet.tailSet(Long.valueOf(System.currentTimeMillis() - timeInterval)).size();
}
public String dhtShallTransfer() {
public String dhtShallTransfer(String segment) {
String cautionCause = onlineCaution();
if (cautionCause != null) {
return "online caution for " + cautionCause + ", dht transmission";
@ -1960,6 +1983,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
if (getConfig(SwitchboardConstants.INDEX_DIST_ALLOW, "false").equalsIgnoreCase("false")) {
return "no DHT distribution: not enabled (per setting)";
}
Segment indexSegment = this.indexSegments.segment(segment);
if (indexSegment.urlMetadata().size() < 10) {
return "no DHT distribution: loadedURL.size() = " + indexSegment.urlMetadata().size();
}
@ -1974,9 +1998,13 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
}
return null; // this means; yes, please do dht transfer
}
public boolean dhtTransferJob() {
final String rejectReason = dhtShallTransfer();
return dhtTransferJob(getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default"));
}
public boolean dhtTransferJob(String segment) {
final String rejectReason = dhtShallTransfer(segment);
if (rejectReason != null) {
if (this.log.isFine()) log.logFine(rejectReason);
return false;
@ -2073,10 +2101,10 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
peers.mySeed().put(yacySeed.RSPEED, Double.toString(totalQPM /*Math.max((float) requestcdiff, 0f) * 60f / Math.max((float) uptimediff, 1f)*/ ));
peers.mySeed().put(yacySeed.UPTIME, Long.toString(uptime/60)); // the number of minutes that the peer is up in minutes/day (moving average MA30)
peers.mySeed().put(yacySeed.LCOUNT, Integer.toString(indexSegment.urlMetadata().size())); // the number of links that the peer has stored (LURL's)
peers.mySeed().put(yacySeed.LCOUNT, Integer.toString(indexSegments.URLCount())); // the number of links that the peer has stored (LURL's)
peers.mySeed().put(yacySeed.NCOUNT, Integer.toString(crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's)
peers.mySeed().put(yacySeed.RCOUNT, Integer.toString(crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
peers.mySeed().put(yacySeed.ICOUNT, Integer.toString(indexSegment.termIndex().sizesMax())); // the minimum number of words that the peer has indexed (as it says)
peers.mySeed().put(yacySeed.ICOUNT, Integer.toString(indexSegments.RWICount())); // the minimum number of words that the peer has indexed (as it says)
peers.mySeed().put(yacySeed.SCOUNT, Integer.toString(peers.sizeConnected())); // the number of seeds that the peer has stored
peers.mySeed().put(yacySeed.CCOUNT, Double.toString(((int) ((peers.sizeConnected() + peers.sizeDisconnected() + peers.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)
peers.mySeed().put(yacySeed.VERSION, yacyBuildProperties.getLongVersion());

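With Segments in place, Switchboard callers name a process instead of holding the single index. A hedged sketch of the new call pattern, assuming an existing Switchboard sb; the hash value is hypothetical:

// resolve the concrete segment for a process
Segment localSegment = sb.indexSegments.segment(Segments.Process.LOCALCRAWLING);

// per-process views without fetching the whole Segment
int loadedUrls = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).size();
String status = sb.urlExists(Segments.Process.LOCALCRAWLING, "hypothetical-hash");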
@ -437,4 +437,16 @@ public final class SwitchboardConstants {
public static final String TRAY_ICON_FORCED = "trayIcon.force";
public static final String TRAY_LABEL = "tray.label";
public static final String BROWSERINTEGRATION = "browserintegration";
/**
* Segments
*/
public static final String SEGMENT_RECEIPTS = "segment.process.receipts_tmp";
public static final String SEGMENT_QUERIES = "segment.process.queries_tmp";
public static final String SEGMENT_DHTIN = "segment.process.dhtin_tmp";
public static final String SEGMENT_DHTOUT = "segment.process.dhtout_tmp";
public static final String SEGMENT_PROXY = "segment.process.proxy_tmp";
public static final String SEGMENT_LOCALCRAWLING = "segment.process.localcrawling_tmp";
    public static final String SEGMENT_REMOTECRAWLING = "segment.process.remotecrawling_tmp";
public static final String SEGMENT_PUBLIC = "segment.process.public_tmp";
}

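These keys let each process be redirected by configuration. A hedged sketch of separating incoming DHT traffic into its own segment, assuming the usual setConfig(String, String) setter on the switchboard and an existing Switchboard sb:

// persist the assignment, then re-read it into the live Segments object
sb.setConfig(SwitchboardConstants.SEGMENT_DHTIN, "dhtin");
sb.indexSegments.setSegment(
        Segments.Process.DHTIN,
        sb.getConfig(SwitchboardConstants.SEGMENT_DHTIN, "default"));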
@ -46,6 +46,7 @@ import de.anomic.http.metadata.ResponseHeader;
import de.anomic.kelondro.index.ARC;
import de.anomic.kelondro.index.ConcurrentARC;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.SetTools;
import de.anomic.yacy.yacySearch;
@ -571,7 +572,7 @@ public class TextSnippet {
}
}
public static String failConsequences(final TextSnippet snippet, final String eventID) throws IOException {
public static String failConsequences(Segment indexSegment, final TextSnippet snippet, final String eventID) throws IOException {
// problems with snippet fetch
final String urlHash = snippet.getUrl().hash();
final String querystring = SetTools.setToString(snippet.getRemainingHashes(), ' ');
@ -580,18 +581,17 @@ public class TextSnippet {
(snippet.getErrorCode() == ERROR_PARSER_FAILED) ||
(snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) {
log.logInfo("error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
Switchboard.getSwitchboard().indexSegment.urlMetadata().remove(urlHash);
indexSegment.urlMetadata().remove(urlHash);
final SearchEvent event = SearchEventCache.getEvent(eventID);
assert Switchboard.getSwitchboard() != null;
assert Switchboard.getSwitchboard().indexSegment != null;
assert indexSegment != null;
assert event != null : "eventID = " + eventID;
assert event.getQuery() != null;
Switchboard.getSwitchboard().indexSegment.termIndex().remove(event.getQuery().queryHashes, urlHash);
indexSegment.termIndex().remove(event.getQuery().queryHashes, urlHash);
event.remove(urlHash);
}
if (snippet.getErrorCode() == ERROR_NO_MATCH) {
log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
Switchboard.getSwitchboard().indexSegment.termIndex().remove(snippet.getRemainingHashes(), urlHash);
indexSegment.termIndex().remove(snippet.getRemainingHashes(), urlHash);
SearchEventCache.getEvent(eventID).remove(urlHash);
}
return snippet.getError();

@ -256,6 +256,7 @@ public class CRProcess {
newacc = new Table(new File(path, CRG_accname), CRG_accrow, 0, 0, true, false);
newseq = new IndexCell<WordReference>(
path,
"index",
Segment.wordReferenceFactory,
Base64Order.enhancedCoder,
CRG_colrow,
@ -390,9 +391,9 @@ public class CRProcess {
public static int genrcix(final File cr_path_in, final File rci_path_out) throws IOException {
//kelondroFlexTable acc = new kelondroFlexTable(cr_path_in, CRG_accname, kelondroBase64Order.enhancedCoder, 128 * 1024 * 1024, -1, CRG_accrow, true);
final IndexCell<WordReference> seq = new IndexCell<WordReference>(
cr_path_in, Segment.wordReferenceFactory, Base64Order.enhancedCoder, CRG_colrow, 10000, 1000000000L, 20, null, 1000000);
cr_path_in, "index", Segment.wordReferenceFactory, Base64Order.enhancedCoder, CRG_colrow, 10000, 1000000000L, 20, null, 1000000);
final IndexCell<WordReference> rci = new IndexCell<WordReference>(
rci_path_out, Segment.wordReferenceFactory, Base64Order.enhancedCoder, RCI_coli, 10000, 1000000000L, 20, null, 1000000);
rci_path_out, "index", Segment.wordReferenceFactory, Base64Order.enhancedCoder, RCI_coli, 10000, 1000000000L, 20, null, 1000000);
// loop over all referees
int count = 0;

@ -33,9 +33,7 @@ import java.util.LinkedHashMap;
import java.util.Map;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.text.BufferedIndex;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
@ -83,8 +81,8 @@ public class Dispatcher {
// the String-key is the primary target as contained in the Entry
private Map<ByteArray, Transmission.Chunk> transmissionCloud;
// the backend is used to store the remaining indexContainers in case that the object is closed
private BufferedIndex<WordReference> backend;
// the segment backend is used to store the remaining indexContainers in case that the object is closed
private Segment segment;
// the seed database
private yacySeedDB seeds;
@ -99,21 +97,19 @@ public class Dispatcher {
private Transmission transmission;
public Dispatcher(
final BufferedIndex<WordReference> backend,
final MetadataRepository repository,
final Segment segment,
final yacySeedDB seeds,
final boolean gzipBody,
final int timeout
) {
this.transmissionCloud = new LinkedHashMap<ByteArray, Transmission.Chunk>();
this.backend = backend;
this.segment = segment;
this.seeds = seeds;
this.log = new Log("INDEX-TRANSFER-DISPATCHER");
this.transmission = new Transmission(
log,
repository,
segment,
seeds,
backend,
gzipBody,
timeout);
//this.selectedContainerCache = null;
@ -171,7 +167,7 @@ public class Dispatcher {
final ArrayList<ReferenceContainer<WordReference>> containers = new ArrayList<ReferenceContainer<WordReference>>(maxContainerCount);
final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = this.backend.references(hash, true, ram);
final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = this.segment.termIndex().references(hash, true, ram);
ReferenceContainer<WordReference> container;
int refcount = 0;
@ -204,7 +200,7 @@ public class Dispatcher {
urlHashes.add(it.next().metadataHash());
}
if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + c.getTermHashAsString() + "'");
if (urlHashes.size() > 0) this.backend.remove(c.getTermHash(), urlHashes);
if (urlHashes.size() > 0) this.segment.termIndex().remove(c.getTermHash(), urlHashes);
}
rc = containers;
} else {
@ -212,7 +208,7 @@ public class Dispatcher {
// but to avoid race conditions return the results from the deletes
rc = new ArrayList<ReferenceContainer<WordReference>>(containers.size());
for (ReferenceContainer<WordReference> c: containers) {
container = this.backend.delete(c.getTermHash());
container = this.segment.termIndex().delete(c.getTermHash());
if (this.log.isFine()) this.log.logFine("selected " + container.size() + " urls for word '" + c.getTermHashAsString() + "'");
if (container.size() != 0) rc.add(container);
}
@ -405,7 +401,7 @@ public class Dispatcher {
if (indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.announceShutdown();
if (this.transmissionCloud != null) {
for (Map.Entry<ByteArray, Transmission.Chunk> e : this.transmissionCloud.entrySet()) {
for (ReferenceContainer<WordReference> i : e.getValue()) try {this.backend.add(i);} catch (IOException e1) {}
for (ReferenceContainer<WordReference> i : e.getValue()) try {this.segment.termIndex().add(i);} catch (IOException e1) {}
}
this.transmissionCloud.clear();
}

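The Dispatcher now takes a whole Segment instead of a separate term-index backend and metadata repository. A hedged construction sketch mirroring the Switchboard wiring shown above, assuming an existing Switchboard sb and yacySeedDB peers:

Dispatcher dhtDispatcher = new Dispatcher(
        sb.indexSegments.segment(Segments.Process.LOCALCRAWLING), // segment to transmit from
        peers,   // seed database
        true,    // gzipBody
        30000);  // timeout in milliseconds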
@ -31,10 +31,8 @@ import java.util.HashSet;
import java.util.Iterator;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerCache;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReference;
@ -47,23 +45,20 @@ import de.anomic.yacy.logging.Log;
public class Transmission {
protected Log log;
protected MetadataRepository repository;
protected Segment segment;
protected yacySeedDB seeds;
protected boolean gzipBody4Transfer;
protected int timeout4Transfer;
protected Index<WordReference> backend;
public Transmission(
Log log,
MetadataRepository repository,
Segment segment,
yacySeedDB seeds,
Index<WordReference> backend,
boolean gzipBody4Transfer,
int timeout4Transfer) {
this.log = log;
this.repository = repository;
this.segment = segment;
this.seeds = seeds;
this.backend = backend;
this.gzipBody4Transfer = gzipBody4Transfer;
this.timeout4Transfer = timeout4Transfer;
}
@ -131,7 +126,7 @@ public class Transmission {
notFound.add(e.metadataHash());
continue;
}
URLMetadataRow r = repository.load(e.metadataHash(), null, 0);
URLMetadataRow r = segment.urlMetadata().load(e.metadataHash(), null, 0);
if (r == null) {
notFound.add(e.metadataHash());
badReferences.add(e.metadataHash());
@ -251,7 +246,7 @@ public class Transmission {
}
public void restore() {
for (ReferenceContainer<WordReference> ic : this) try { backend.add(ic); } catch (IOException e) {}
for (ReferenceContainer<WordReference> ic : this) try { segment.termIndex().add(ic); } catch (IOException e) {}
}
}
}

@ -649,10 +649,10 @@ public final class yacy {
log.logInfo("STARTING URL CLEANUP");
        // db containing all currently loaded urls
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"), false, false);
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"), "text.urlmd", false, false);
        // db used to hold all needed urls
final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"), false, false);
final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"), "text.urlmd", false, false);
final int cacheMem = (int)(MemoryControl.maxMemory - MemoryControl.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
@ -835,7 +835,7 @@ public final class yacy {
final File root = homePath;
final File indexroot = new File(root, "DATA/INDEX");
try {Log.configureLogging(homePath, new File(homePath, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"), false, false);
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"), "text.urlmd", false, false);
currentUrlDB.deadlinkCleaner(null);
currentUrlDB.close();
}
