- added a delete button in the host browser to delete a complete subpath

- removed storage of the default collection name - the default is now "user"
- made stacking of crawl start points concurrent (see the sketch below)
pull/1/head
Michael Peter Christen 12 years ago
parent 0716a24737
commit f8f05ecba7
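
The third change above replaces the sequential stacking loop in Crawler_p with one stacking thread per crawl start point (see Switchboard.stackURLs in the diff below). A minimal, self-contained sketch of that pattern, assuming a stand-in stack() method in place of Switchboard.stackUrl(profile, url) and plain strings in place of DigestURI:

    import java.util.*;
    import java.util.concurrent.ConcurrentHashMap;

    public class ConcurrentStackSketch {
        // hypothetical stand-in for Switchboard.stackUrl(profile, url):
        // returns null on success, otherwise a failure reason
        static String stack(String url) {
            return url.startsWith("http") ? null : "unsupported protocol";
        }

        public static void main(String[] args) throws InterruptedException {
            Set<String> rootURLs = new HashSet<String>(Arrays.asList(
                    "http://example.org/", "mailto:someone@example.org"));
            // thread-safe result collections, because every worker thread writes into them
            final Set<String> successurls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
            final Map<String, String> failurls = new ConcurrentHashMap<String, String>();

            List<Thread> stackthreads = new ArrayList<Thread>();
            for (final String url : rootURLs) {
                Thread t = new Thread() {
                    public void run() {
                        String failreason = stack(url);
                        if (failreason == null) successurls.add(url); else failurls.put(url, failreason);
                    }
                };
                t.start();                 // one stacking thread per crawl start point
                stackthreads.add(t);
            }
            // wait a bounded time for each thread, mirroring the join(5000) in the commit
            for (Thread t : stackthreads) t.join(5000);
            System.out.println("stacked: " + successurls + ", failed: " + failurls);
        }
    }

The sketch uses concurrent collections for the shared result sets; the commit itself passes plain HashSet/HashMap instances in from Crawler_p.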

@@ -85,7 +85,7 @@ public class CrawlStartExpert_p {
boolean collectionEnabled = sb.index.fulltext().getSolrScheme().isEmpty() || sb.index.fulltext().getSolrScheme().contains(YaCySchema.collection_sxt);
prop.put("collectionEnabled", collectionEnabled ? 1 : 0);
prop.put("collection", collectionEnabled ? sb.getConfig("collection", "user") : "");
prop.put("collection", collectionEnabled ? "user" : "");
// return rewrite properties
return prop;

@@ -43,13 +43,8 @@ import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.SitemapImporter;
import net.yacy.data.BookmarkHelper;
import net.yacy.data.BookmarksDB;
import net.yacy.data.ListManager;
import net.yacy.data.WorkTables;
import net.yacy.data.ymark.YMarkTables;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -212,7 +207,7 @@ public class Crawler_p {
boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents without loading them
env.setConfig("crawlingDirectDocByURL", directDocByURL);
final String collection = post.get("collection", sb.getConfig("collection", "user"));
final String collection = post.get("collection", "user");
env.setConfig("collection", collection);
// recrawl
@@ -376,13 +371,10 @@ public class Crawler_p {
// stack requests
sb.crawler.putActive(handle, profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
Set<DigestURI> successurls = new HashSet<DigestURI>();
Map<DigestURI,String> failurls = new HashMap<DigestURI, String>();
String failreason;
for (DigestURI url: rootURLs) {
if ((failreason = stackUrl(sb, profile, url)) == null) successurls.add(url); else failurls.put(url, failreason);
}
final Set<DigestURI> successurls = new HashSet<DigestURI>();
final Map<DigestURI,String> failurls = new HashMap<DigestURI, String>();
sb.stackURLs(rootURLs, profile, successurls, failurls);
if (failurls.size() == 0) {
// liftoff!
prop.put("info", "8");
@@ -552,106 +544,6 @@ public class Crawler_p {
return prop;
}
/**
* stack the url to the crawler
* @param sb
* @param profile
* @param url
* @return null if this was ok. If this failed, return a string with a fail reason
*/
private static String stackUrl(Switchboard sb, CrawlProfile profile, DigestURI url) {
byte[] handle = ASCII.getBytes(profile.handle());
// remove url from the index to be prepared for a re-crawl
final byte[] urlhash = url.hash();
sb.index.fulltext().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
// special handling of ftp protocol
if (url.isFTP()) {
try {
sb.crawler.putActive(handle, profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
return null;
} catch (final Exception e) {
// mist
Log.logException(e);
return "problem crawling an ftp site: " + e.getMessage();
}
}
// get a scraper to get the title
Document scraper;
try {
scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
} catch (IOException e) {
Log.logException(e);
return "scraper cannot load URL: " + e.getMessage();
}
final String title = scraper == null ? url.toNormalform(true) : scraper.dc_title();
final String description = scraper.dc_description();
// add the url to the crawl stack
sb.crawler.removePassive(handle); // if there is an old entry, delete it
sb.crawler.putActive(handle, profile);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
url,
null,
"CRAWLING-ROOT",
new Date(),
profile.handle(),
0,
0,
0,
0
));
if (reasonString != null) return reasonString;
// create a bookmark from crawl start url
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
tags.add("crawlStart");
final String[] keywords = scraper.dc_subject();
if (keywords != null) {
for (final String k: keywords) {
final String kk = BookmarkHelper.cleanTagsString(k);
if (kk.length() > 0) tags.add(kk);
}
}
String tagStr = tags.toString();
if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2);
// we will create always a bookmark to use this to track crawled hosts
final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(url.toNormalform(true), "admin");
if (bookmark != null) {
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
// do the same for ymarks
// TODO: could a non admin user add crawls?
try {
sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");
} catch (IOException e) {
Log.logException(e);
} catch (Failure e) {
Log.logException(e);
}
// that was ok
return null;
}
private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
if (!recrawlIfOlderCheck) return 0L;
if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;

@@ -64,7 +64,9 @@ function updatepage(str) {
<fieldset class="yacys">
Host/URL:
<input id="search" type="text" name="path" value="#[path]#" size="40" maxlength="250" />
<input type="submit" name="list" value="Browse Host" class="submitready" style="width:240px;"/><br />
<input type="submit" name="list" value="Browse Host" class="submitready" style="width:240px;"/>
#(delete)#::<input type="submit" name="delete" value="Delete Subpath" class="submitready" style="width:240px;" onclick="return confirm('Confirm Deletion')"/>#(/delete)#
<br />
<div id="searchresults"></div>
</fieldset>
</form>

@@ -20,10 +20,12 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
@@ -91,6 +93,8 @@ public class HostBrowser {
!path.startsWith("smb://") &&
!path.startsWith("file://"))) { path = "http://" + path; }
prop.putHTML("path", path);
prop.put("delete", admin && path.length() > 0 ? 1 : 0);
DigestURI pathURI = null;
try {pathURI = new DigestURI(path);} catch (MalformedURLException e) {}
@@ -145,6 +149,12 @@ public class HostBrowser {
}
if (path.length() > 0) {
boolean delete = false;
if (admin && post.containsKey("delete")) {
// delete the complete path!! That includes everything that matches with this prefix.
delete = true;
}
boolean complete = post.getBoolean("complete");
if (complete) { // we want only root paths for complete lists
p = path.indexOf('/', 10);
@@ -174,10 +184,19 @@ public class HostBrowser {
Set<String> inboundLinks = new HashSet<String>();
Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
int hostsize = 0;
final List<byte[]> deleteIDs = new ArrayList<byte[]>();
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
hostsize++;
if (complete || u.startsWith(path)) storedDocs.add(u);
if (u.startsWith(path)) {
if (delete) {
deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.name())));
} else {
storedDocs.add(u);
}
} else if (complete) {
storedDocs.add(u);
}
// collect inboundlinks to browse the host
Iterator<String> links = URIMetadataNode.getLinks(doc, true);
while (links.hasNext()) {
@@ -202,6 +221,7 @@ public class HostBrowser {
} catch (MalformedURLException e) {}
}
}
if (deleteIDs.size() > 0) sb.index.fulltext().removeConcurrently(deleteIDs);
// now combine both lists into one
Map<String, Boolean> files = new HashMap<String, Boolean>();
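
For reference, the per-document branch added above reduces to the following self-contained decision helper; the plain-string parameters stand in for the SolrDocument fields (YaCySchema.sku for the URL, YaCySchema.id for the document id) and are not the actual YaCy types:

    import java.util.*;

    public class HostBrowserDeleteSketch {
        /** Mirrors the new branch in HostBrowser: a document under the path prefix is either
         *  queued for deletion (delete mode) or listed; with a "complete" listing, documents
         *  outside the prefix are listed as well. */
        static void classify(String url, String id, String path, boolean delete, boolean complete,
                             List<String> deleteIDs, Set<String> storedDocs) {
            if (url.startsWith(path)) {
                if (delete) deleteIDs.add(id); else storedDocs.add(url);
            } else if (complete) {
                storedDocs.add(url);
            }
        }

        public static void main(String[] args) {
            List<String> deleteIDs = new ArrayList<String>();
            Set<String> storedDocs = new HashSet<String>();
            classify("http://host/a/x.html", "id1", "http://host/a/", true, false, deleteIDs, storedDocs);
            classify("http://host/b/y.html", "id2", "http://host/a/", true, false, deleteIDs, storedDocs);
            System.out.println("delete: " + deleteIDs + " keep: " + storedDocs); // delete: [id1] keep: []
        }
    }

If any ids were collected, they are handed to the new Fulltext.removeConcurrently shown further down, so the deletion itself does not block the HostBrowser request.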

@@ -396,6 +396,10 @@ public final class FileUtils {
return mb;
}
private final static Pattern ps = Pattern.compile("\\\\");
private final static Pattern pn = Pattern.compile("\\n");
private final static Pattern pe = Pattern.compile("=");
public static void saveMap(final File file, final Map<String, String> props, final String comment) {
PrintWriter pw = null;
final File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000));
@@ -406,12 +410,16 @@ public final class FileUtils {
for ( final Map.Entry<String, String> entry : props.entrySet() ) {
key = entry.getKey();
if ( key != null ) {
key = key.replace("\\", "\\\\").replace("\n", "\\n").replace("=", "\\=");
key = ps.matcher(key).replaceAll("\\\\");
key = pn.matcher(key).replaceAll("\\n");
key = pe.matcher(key).replaceAll("\\=");
}
if ( entry.getValue() == null ) {
value = "";
} else {
value = entry.getValue().replace("\\", "\\\\").replace("\n", "\\n");
value = entry.getValue();
value = ps.matcher(value).replaceAll("\\\\");
value = pn.matcher(value).replaceAll("\\n");
}
pw.println(key + "=" + value);
}
@@ -432,7 +440,7 @@ public final class FileUtils {
// ignore
}
}
public static void saveMapB(final File file, final Map<String, byte[]> props, final String comment) {
HashMap<String, String> m = new HashMap<String, String>();
for (Map.Entry<String, byte[]> e: props.entrySet()) m.put(e.getKey(), UTF8.String(e.getValue()));
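
The FileUtils change above precompiles the three patterns so they are built once per class instead of implicitly on every replace call inside the save loop. A small self-contained sketch of that approach, assuming the same escaping contract as the old String.replace chain (a backslash becomes two backslashes, a newline becomes backslash-n, an equals sign becomes backslash-equals); the Matcher.quoteReplacement wrapper is my addition, used because replaceAll treats backslash and dollar signs specially in replacement strings:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class SaveMapEscapeSketch {
        // compiled once and reused for every key/value, which is the point of the change
        private static final Pattern PS = Pattern.compile("\\\\"); // a single literal backslash
        private static final Pattern PN = Pattern.compile("\\n");  // a newline
        private static final Pattern PE = Pattern.compile("=");

        static String escapeKey(String key) {
            key = PS.matcher(key).replaceAll(Matcher.quoteReplacement("\\\\")); // \  -> \\
            key = PN.matcher(key).replaceAll(Matcher.quoteReplacement("\\n"));  // LF -> \n
            key = PE.matcher(key).replaceAll(Matcher.quoteReplacement("\\="));  // =  -> \=
            return key;
        }

        public static void main(String[] args) {
            System.out.println(escapeKey("a=b\\c\nd")); // prints a\=b\\c\nd on one line
        }
    }

Values are handled the same way in the diff, minus the equals-sign escaping.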

@@ -1061,7 +1061,7 @@ public final class Protocol
// evaluate result
List<URIMetadataNode> container = new ArrayList<URIMetadataNode>();
if (docList.size() > 0) {// create containers
Network.log.logInfo("SEARCH (solr), returned " + docList.size() + " documents from " + (target == null ? "shard" : ("peer " + target.hash + ":" + target.getName()))) ;
Network.log.logInfo("SEARCH (solr), returned " + docList.size() + " out of " + docList.getNumFound() + " documents from " + (target == null ? "shard" : ("peer " + target.hash + ":" + target.getName()))) ;
int term = count;
for (final SolrDocument doc: docList) {

@@ -293,7 +293,7 @@ public class RemoteSearch extends Thread {
}
}
};
if (targetPeer == null) solr.run(); else solr.start();
/*if (targetPeer == null) solr.run(); else*/ solr.start();
return solr;
}

@@ -60,6 +60,7 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
@@ -124,6 +125,7 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.data.BlogBoard;
import net.yacy.data.BlogBoardComments;
import net.yacy.data.BookmarkHelper;
import net.yacy.data.BookmarksDB;
import net.yacy.data.ListManager;
import net.yacy.data.MessageBoard;
@@ -133,11 +135,13 @@ import net.yacy.data.WorkTables;
import net.yacy.data.wiki.WikiBoard;
import net.yacy.data.wiki.WikiCode;
import net.yacy.data.wiki.WikiParser;
import net.yacy.data.ymark.YMarkTables;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.Parser.Failure;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
@@ -195,8 +199,7 @@ import net.yacy.utils.crypt;
import com.google.common.io.Files;
public final class Switchboard extends serverSwitch
{
public final class Switchboard extends serverSwitch {
// load slots
public static int xstackCrawlSlots = 2000;
@@ -269,18 +272,12 @@ public final class Switchboard extends serverSwitch
private final Semaphore shutdownSync = new Semaphore(0);
private boolean terminate = false;
//private Object crawlingPausedSync = new Object();
//private boolean crawlingIsPaused = false;
private static Switchboard sb;
public HashMap<String, Object[]> crawlJobsStatus = new HashMap<String, Object[]>();
private static Switchboard sb = null;
public Switchboard(final File dataPath, final File appPath, final String initPath, final String configPath)
throws IOException {
public Switchboard(final File dataPath, final File appPath, final String initPath, final String configPath) throws IOException {
super(dataPath, appPath, initPath, configPath);
sb = this;
// check if port is already occupied
final int port = getConfigInt("port", 8090);
try {
@@ -294,7 +291,6 @@ public final class Switchboard extends serverSwitch
}
MemoryTracker.startSystemProfiling();
sb = this;
// set loglevel and log
setLog(new Log("SWITCHBOARD"));
@@ -374,9 +370,9 @@ public final class Switchboard extends serverSwitch
// start indexing management
this.log.logConfig("Starting Indexing Management");
final String networkName = getConfig(SwitchboardConstants.NETWORK_NAME, "");
final long fileSizeMax = (OS.isWindows) ? sb.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : sb.getConfigLong( "filesize.max.other", Integer.MAX_VALUE);
final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
final int partitionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
final long fileSizeMax = (OS.isWindows) ? this.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : this.getConfigLong( "filesize.max.other", Integer.MAX_VALUE);
final int redundancy = (int) this.getConfigLong("network.unit.dhtredundancy.senior", 1);
final int partitionExponent = (int) this.getConfigLong("network.unit.dht.partitionExponent", 0);
this.networkRoot = new File(new File(indexPath, networkName), "NETWORK");
this.queuesRoot = new File(new File(indexPath, networkName), "QUEUES");
this.networkRoot.mkdirs();
@@ -1022,7 +1018,7 @@ public final class Switchboard extends serverSwitch
"this is the content control import thread",
null,
new InstantBusyThread(
new ContentControlImportThread(sb),
new ContentControlImportThread(this),
"run",
SwitchboardConstants.PEER_PING_METHOD_JOBCOUNT,
SwitchboardConstants.PEER_PING_METHOD_FREEMEM,
@@ -1037,7 +1033,7 @@ public final class Switchboard extends serverSwitch
"this is the content control filter update thread",
null,
new InstantBusyThread(
new ContentControlFilterUpdateThread(sb),
new ContentControlFilterUpdateThread(this),
"run",
SwitchboardConstants.PEER_PING_METHOD_JOBCOUNT,
SwitchboardConstants.PEER_PING_METHOD_FREEMEM,
@@ -1063,7 +1059,6 @@ public final class Switchboard extends serverSwitch
this.trail = new LinkedBlockingQueue<String>();
this.log.logConfig("Finished Switchboard Initialization");
sb = this;
}
public int getIndexingProcessorsQueueSize() {
@@ -1235,10 +1230,9 @@ public final class Switchboard extends serverSwitch
final int wordCacheMaxCount =
(int) getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
final long fileSizeMax =
(OS.isWindows) ? sb.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : sb
.getConfigLong("filesize.max.other", Integer.MAX_VALUE);
final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
final int partitionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
(OS.isWindows) ? this.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : this.getConfigLong("filesize.max.other", Integer.MAX_VALUE);
final int redundancy = (int) this.getConfigLong("network.unit.dhtredundancy.senior", 1);
final int partitionExponent = (int) this.getConfigLong("network.unit.dht.partitionExponent", 0);
final String networkName = getConfig(SwitchboardConstants.NETWORK_NAME, "");
this.networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
@@ -1543,7 +1537,7 @@ public final class Switchboard extends serverSwitch
public RankingProfile getRanking() {
return (getConfig("rankingProfile", "").isEmpty())
? new RankingProfile(Classification.ContentDomain.TEXT)
: new RankingProfile("", crypt.simpleDecode(sb.getConfig("rankingProfile", "")));
: new RankingProfile("", crypt.simpleDecode(this.getConfig("rankingProfile", "")));
}
/**
@@ -1970,7 +1964,7 @@ public final class Switchboard extends serverSwitch
// clear caches if necessary
if ( !MemoryControl.request(8000000L, false) ) {
sb.index.fulltext().clearCache();
this.index.fulltext().clearCache();
SearchEventCache.cleanupEvents(false);
this.trail.clear();
}
@@ -2246,7 +2240,7 @@ public final class Switchboard extends serverSwitch
this.clusterhashes = this.peers.clusterHashes(getConfig("cluster.peers.yacydomain", ""));
// check if we are reachable and try to map port again if not (e.g. when router rebooted)
if ( getConfigBool(SwitchboardConstants.UPNP_ENABLED, false) && sb.peers.mySeed().isJunior() ) {
if ( getConfigBool(SwitchboardConstants.UPNP_ENABLED, false) && this.peers.mySeed().isJunior() ) {
UPnP.addPortMapping();
}
@@ -2698,6 +2692,122 @@ public final class Switchboard extends serverSwitch
}
}
public void stackURLs(Set<DigestURI> rootURLs, final CrawlProfile profile, final Set<DigestURI> successurls, final Map<DigestURI,String> failurls) {
List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
for (DigestURI url: rootURLs) {
final DigestURI turl = url;
Thread t = new Thread() {
public void run() {
String failreason;
if ((failreason = Switchboard.this.stackUrl(profile, turl)) == null) successurls.add(turl); else failurls.put(turl, failreason);
}
};
t.start();
stackthreads.add(t);
}
for (Thread t: stackthreads)try {t.join(5000);} catch (InterruptedException e) {}
}
/**
* stack the url to the crawler
* @param profile
* @param url
* @return null if this was ok. If this failed, return a string with a fail reason
*/
public String stackUrl(CrawlProfile profile, DigestURI url) {
byte[] handle = ASCII.getBytes(profile.handle());
// remove url from the index to be prepared for a re-crawl
final byte[] urlhash = url.hash();
this.index.fulltext().remove(urlhash);
this.crawlQueues.noticeURL.removeByURLHash(urlhash);
this.crawlQueues.errorURL.remove(urlhash);
// special handling of ftp protocol
if (url.isFTP()) {
try {
this.crawler.putActive(handle, profile);
this.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
this.crawlStacker.enqueueEntriesFTP(this.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
return null;
} catch (final Exception e) {
// mist
Log.logException(e);
return "problem crawling an ftp site: " + e.getMessage();
}
}
// get a scraper to get the title
Document scraper;
try {
scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
} catch (IOException e) {
Log.logException(e);
return "scraper cannot load URL: " + e.getMessage();
}
final String title = scraper == null ? url.toNormalform(true) : scraper.dc_title();
final String description = scraper.dc_description();
// add the url to the crawl stack
this.crawler.removePassive(handle); // if there is an old entry, delete it
this.crawler.putActive(handle, profile);
final String reasonString = this.crawlStacker.stackCrawl(new Request(
this.peers.mySeed().hash.getBytes(),
url,
null,
"CRAWLING-ROOT",
new Date(),
profile.handle(),
0,
0,
0,
0
));
if (reasonString != null) return reasonString;
// create a bookmark from crawl start url
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
tags.add("crawlStart");
final String[] keywords = scraper.dc_subject();
if (keywords != null) {
for (final String k: keywords) {
final String kk = BookmarkHelper.cleanTagsString(k);
if (kk.length() > 0) tags.add(kk);
}
}
String tagStr = tags.toString();
if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2);
// we will create always a bookmark to use this to track crawled hosts
final BookmarksDB.Bookmark bookmark = this.bookmarksDB.createBookmark(url.toNormalform(true), "admin");
if (bookmark != null) {
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
this.bookmarksDB.saveBookmark(bookmark);
}
// do the same for ymarks
// TODO: could a non admin user add crawls?
try {
this.tables.bookmarks.createBookmark(this.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");
} catch (IOException e) {
Log.logException(e);
} catch (Failure e) {
Log.logException(e);
}
// that was ok
return null;
}
/**
* load the content of a URL, parse the content and add the content to the index This process is started
* concurrently. The method returns immediately after the call.
@@ -2718,7 +2828,7 @@ public final class Switchboard extends serverSwitch
return; // don't do double-work
}
final Request request = this.loader.request(url, true, true);
final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
final String urls = url.toNormalform(true);
if ( acceptedError != null ) {
@@ -2793,7 +2903,7 @@ public final class Switchboard extends serverSwitch
return; // don't do double-work
}
final Request request = this.loader.request(url, true, true);
final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
if (acceptedError != null) {
this.log.logInfo("addToCrawler: cannot load "
@@ -2804,9 +2914,9 @@ public final class Switchboard extends serverSwitch
}
final String s;
if (asglobal) {
s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots);
s = this.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots);
} else {
s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots);
s = this.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots);
}
if (s != null) {
@@ -3179,7 +3289,7 @@ public final class Switchboard extends serverSwitch
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal",false);
final boolean globalcrawljob = Switchboard.this.getConfigBool("heuristic.searchresults.crawlglobal",false);
while (i.hasNext()) {
url = DigestURI.toDigestURI(i.next());
boolean islocal = url.getHost().contentEquals(startUrl.getHost());
@@ -3239,7 +3349,7 @@ public final class Switchboard extends serverSwitch
searchEvent.getRankingResult().oneFeederStarted();
try {
final Response response =
sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
final byte[] resource = (response == null) ? null : response.getContent();
//System.out.println("BLEKKO: " + UTF8.String(resource));
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
@@ -3337,7 +3447,7 @@ public final class Switchboard extends serverSwitch
if ( Thread.currentThread().isInterrupted() ) {
break;
}
seedListFileURL = sb.getConfig("network.unit.bootstrap.seedlist" + c, "");
seedListFileURL = this.getConfig("network.unit.bootstrap.seedlist" + c, "");
if ( seedListFileURL.isEmpty() ) {
break;
}

@@ -295,6 +295,13 @@ public final class Fulltext implements Iterable<byte[]> {
if (MemoryControl.shortStatus()) clearCache();
}
public void removeConcurrently(final List<byte[]> deleteIDs) {
new Thread() {
public void run() {for (byte[] id: deleteIDs) {remove(id);}}
}.start();
this.solr.commit();
}
public boolean remove(final byte[] urlHash) {
if (urlHash == null) return false;
try {
@@ -720,7 +727,7 @@ public final class Fulltext implements Iterable<byte[]> {
}
/**
* using a fragment of the url hash (5 bytes: bytes 6 to 10) it is possible to address all urls from a specific domain
* using a fragment of the url hash (6 bytes: bytes 6 to 11) it is possible to address all urls from a specific domain
* here such a fragment can be used to delete all these domains at once
* @param hosthash
* @return number of deleted domains
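
The removeConcurrently method added above moves index deletion off the request thread. A self-contained restatement of that fire-and-forget pattern, with a hypothetical ConcurrentHashMap standing in for the Solr-backed Fulltext index:

    import java.util.*;
    import java.util.concurrent.ConcurrentHashMap;

    public class ConcurrentRemovalSketch {
        private final Map<String, String> index = new ConcurrentHashMap<String, String>();

        boolean remove(String id) { return index.remove(id) != null; }

        // caller returns immediately; a background thread works through the id list
        void removeConcurrently(final List<String> deleteIDs) {
            new Thread() {
                public void run() { for (String id : deleteIDs) remove(id); }
            }.start();
        }

        public static void main(String[] args) throws InterruptedException {
            ConcurrentRemovalSketch idx = new ConcurrentRemovalSketch();
            idx.index.put("id1", "http://host/a/x.html");
            idx.index.put("id2", "http://host/b/y.html");
            idx.removeConcurrently(Arrays.asList("id1"));
            Thread.sleep(100); // toy example only: give the background thread time to finish
            System.out.println(idx.index.keySet()); // [id2]
        }
    }

In the commit, HostBrowser uses this so that deleting a large subpath does not block the page response; the Solr commit inside removeConcurrently is still issued on the calling thread.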
