diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java
index b54233d0d..a6d18c56c 100644
--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@@ -85,7 +85,7 @@ public class CrawlStartExpert_p {
         boolean collectionEnabled = sb.index.fulltext().getSolrScheme().isEmpty() || sb.index.fulltext().getSolrScheme().contains(YaCySchema.collection_sxt);
         prop.put("collectionEnabled", collectionEnabled ? 1 : 0);
-        prop.put("collection", collectionEnabled ? sb.getConfig("collection", "user") : "");
+        prop.put("collection", collectionEnabled ? "user" : "");
 
         // return rewrite properties
         return prop;
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index ee147811f..c54b08494 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -43,13 +43,8 @@ import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.SitemapImporter;
-import net.yacy.data.BookmarkHelper;
-import net.yacy.data.BookmarksDB;
-import net.yacy.data.ListManager;
 import net.yacy.data.WorkTables;
-import net.yacy.data.ymark.YMarkTables;
 import net.yacy.document.Document;
-import net.yacy.document.Parser.Failure;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -212,7 +207,7 @@ public class Crawler_p {
         boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents without loading them
         env.setConfig("crawlingDirectDocByURL", directDocByURL);
 
-        final String collection = post.get("collection", sb.getConfig("collection", "user"));
+        final String collection = post.get("collection", "user");
         env.setConfig("collection", collection);
 
         // recrawl
@@ -376,13 +371,10 @@ public class Crawler_p {
                 // stack requests
                 sb.crawler.putActive(handle, profile);
                 sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
-                Set<DigestURI> successurls = new HashSet<DigestURI>();
-                Map<DigestURI, String> failurls = new HashMap<DigestURI, String>();
-                String failreason;
-                for (DigestURI url: rootURLs) {
-                    if ((failreason = stackUrl(sb, profile, url)) == null) successurls.add(url); else failurls.put(url, failreason);
-                }
-
+                final Set<DigestURI> successurls = new HashSet<DigestURI>();
+                final Map<DigestURI, String> failurls = new HashMap<DigestURI, String>();
+                sb.stackURLs(rootURLs, profile, successurls, failurls);
+
                 if (failurls.size() == 0) {
                     // liftoff!
                     prop.put("info", "8");
@@ -552,106 +544,6 @@ public class Crawler_p {
         return prop;
     }
 
-    /**
-     * stack the url to the crawler
-     * @param sb
-     * @param profile
-     * @param url
-     * @return null if this was ok. If this failed, return a string with a fail reason
-     */
-    private static String stackUrl(Switchboard sb, CrawlProfile profile, DigestURI url) {
-
-        byte[] handle = ASCII.getBytes(profile.handle());
-
-        // remove url from the index to be prepared for a re-crawl
-        final byte[] urlhash = url.hash();
-        sb.index.fulltext().remove(urlhash);
-        sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
-        sb.crawlQueues.errorURL.remove(urlhash);
-
-        // special handling of ftp protocol
-        if (url.isFTP()) {
-            try {
-                sb.crawler.putActive(handle, profile);
-                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
-                sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
-                return null;
-            } catch (final Exception e) {
-                // mist
-                Log.logException(e);
-                return "problem crawling an ftp site: " + e.getMessage();
-            }
-        }
-
-        // get a scraper to get the title
-        Document scraper;
-        try {
-            scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
-        } catch (IOException e) {
-            Log.logException(e);
-            return "scraper cannot load URL: " + e.getMessage();
-        }
-
-        final String title = scraper == null ? url.toNormalform(true) : scraper.dc_title();
-        final String description = scraper.dc_description();
-
-        // add the url to the crawl stack
-        sb.crawler.removePassive(handle); // if there is an old entry, delete it
-        sb.crawler.putActive(handle, profile);
-        final String reasonString = sb.crawlStacker.stackCrawl(new Request(
-                sb.peers.mySeed().hash.getBytes(),
-                url,
-                null,
-                "CRAWLING-ROOT",
-                new Date(),
-                profile.handle(),
-                0,
-                0,
-                0,
-                0
-                ));
-
-        if (reasonString != null) return reasonString;
-
-        // create a bookmark from crawl start url
-        //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
-        final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
-        tags.add("crawlStart");
-        final String[] keywords = scraper.dc_subject();
-        if (keywords != null) {
-            for (final String k: keywords) {
-                final String kk = BookmarkHelper.cleanTagsString(k);
-                if (kk.length() > 0) tags.add(kk);
-            }
-        }
-        String tagStr = tags.toString();
-        if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2);
-
-        // we will create always a bookmark to use this to track crawled hosts
-        final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(url.toNormalform(true), "admin");
-        if (bookmark != null) {
-            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
-            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
-            bookmark.setOwner("admin");
-            bookmark.setPublic(false);
-            bookmark.setTags(tags, true);
-            sb.bookmarksDB.saveBookmark(bookmark);
-        }
-
-        // do the same for ymarks
-        // TODO: could a non admin user add crawls?
-        try {
-            sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");
-        } catch (IOException e) {
-            Log.logException(e);
-        } catch (Failure e) {
-            Log.logException(e);
-        }
-
-        // that was ok
-        return null;
-    }
-
     private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
         if (!recrawlIfOlderCheck) return 0L;
         if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;
diff --git a/htroot/HostBrowser.html b/htroot/HostBrowser.html
index 0bf14ab83..eb6247135 100644
--- a/htroot/HostBrowser.html
+++ b/htroot/HostBrowser.html
@@ -64,7 +64,9 @@ function updatepage(str) {
       Host/URL:
-      
+      
+      #(delete)#::#(/delete)#
+      
diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java
index c1cdc5fb3..c13ba3ae0 100644
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@@ -20,10 +20,12 @@
 
 import java.io.IOException;
 import java.net.MalformedURLException;
+import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
@@ -91,6 +93,8 @@ public class HostBrowser {
                 !path.startsWith("smb://") && !path.startsWith("file://"))) { path = "http://" + path; }
             prop.putHTML("path", path);
+            prop.put("delete", admin && path.length() > 0 ? 1 : 0);
+
             DigestURI pathURI = null;
             try {pathURI = new DigestURI(path);} catch (MalformedURLException e) {}
@@ -145,6 +149,12 @@ public class HostBrowser {
             }
 
             if (path.length() > 0) {
+                boolean delete = false;
+                if (admin && post.containsKey("delete")) {
+                    // delete the complete path!! That includes everything that matches with this prefix.
+                    delete = true;
+                }
+
                 boolean complete = post.getBoolean("complete");
                 if (complete) { // we want only root paths for complete lists
                     p = path.indexOf('/', 10);
@@ -174,10 +184,19 @@ public class HostBrowser {
                     Set inboundLinks = new HashSet();
                     Map> outboundHosts = new HashMap>();
                     int hostsize = 0;
+                    final List<byte[]> deleteIDs = new ArrayList<byte[]>();
                     while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                         String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
                         hostsize++;
-                        if (complete || u.startsWith(path)) storedDocs.add(u);
+                        if (u.startsWith(path)) {
+                            if (delete) {
+                                deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.name())));
+                            } else {
+                                storedDocs.add(u);
+                            }
+                        } else if (complete) {
+                            storedDocs.add(u);
+                        }
                         // collect inboundlinks to browse the host
                         Iterator links = URIMetadataNode.getLinks(doc, true);
                         while (links.hasNext()) {
@@ -202,6 +221,7 @@ public class HostBrowser {
                             } catch (MalformedURLException e) {}
                         }
                     }
+                    if (deleteIDs.size() > 0) sb.index.fulltext().removeConcurrently(deleteIDs);
 
                     // now combine both lists into one
                     Map files = new HashMap();
diff --git a/source/net/yacy/kelondro/util/FileUtils.java b/source/net/yacy/kelondro/util/FileUtils.java
index 368362015..cd10fb412 100644
--- a/source/net/yacy/kelondro/util/FileUtils.java
+++ b/source/net/yacy/kelondro/util/FileUtils.java
@@ -396,6 +396,10 @@ public final class FileUtils {
         return mb;
     }
 
+    private final static Pattern ps = Pattern.compile("\\\\");
+    private final static Pattern pn = Pattern.compile("\\n");
+    private final static Pattern pe = Pattern.compile("=");
+
     public static void saveMap(final File file, final Map<String, String> props, final String comment) {
         PrintWriter pw = null;
         final File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000));
@@ -406,12 +410,16 @@ public final class FileUtils {
         for ( final Map.Entry<String, String> entry : props.entrySet() ) {
             key = entry.getKey();
             if ( key != null ) {
-                key = key.replace("\\", "\\\\").replace("\n", "\\n").replace("=", "\\=");
+                key = ps.matcher(key).replaceAll("\\\\");
+                key = pn.matcher(key).replaceAll("\\n");
+                key = pe.matcher(key).replaceAll("\\=");
             }
             if ( entry.getValue() == null ) {
                 value = "";
             } else {
-                value = entry.getValue().replace("\\", "\\\\").replace("\n", "\\n");
+                value = entry.getValue();
+                value = ps.matcher(value).replaceAll("\\\\");
+                value = pn.matcher(value).replaceAll("\\n");
             }
             pw.println(key + "=" + value);
         }
@@ -432,7 +440,7 @@ public final class FileUtils {
                 // ignore
             }
         }
-
+    
     public static void saveMapB(final File file, final Map<String, byte[]> props, final String comment) {
         HashMap<String, String> m = new HashMap<String, String>();
         for (Map.Entry<String, byte[]> e: props.entrySet()) m.put(e.getKey(), UTF8.String(e.getValue()));
diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java
index 32bca5a1f..601b153b1 100644
--- a/source/net/yacy/peers/Protocol.java
+++ b/source/net/yacy/peers/Protocol.java
@@ -1061,7 +1061,7 @@ public final class Protocol
         // evaluate result
         List container = new ArrayList();
         if (docList.size() > 0) {// create containers
-            Network.log.logInfo("SEARCH (solr), returned " + docList.size() + " documents from " + (target == null ? "shard" : ("peer " + target.hash + ":" + target.getName()))) ;
+            Network.log.logInfo("SEARCH (solr), returned " + docList.size() + " out of " + docList.getNumFound() + " documents from " + (target == null ? "shard" : ("peer " + target.hash + ":" + target.getName()))) ;
 
             int term = count;
             for (final SolrDocument doc: docList) {
diff --git a/source/net/yacy/peers/RemoteSearch.java b/source/net/yacy/peers/RemoteSearch.java
index 30056d76f..1ef635b7c 100644
--- a/source/net/yacy/peers/RemoteSearch.java
+++ b/source/net/yacy/peers/RemoteSearch.java
@@ -293,7 +293,7 @@ public class RemoteSearch extends Thread {
                 }
             }
         };
-        if (targetPeer == null) solr.run(); else solr.start();
+        /*if (targetPeer == null) solr.run(); else*/ solr.start();
         return solr;
     }
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 5fc12c56a..ae7713fbc 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -60,6 +60,7 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Properties;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeMap;
@@ -124,6 +125,7 @@ import net.yacy.crawler.retrieval.Response;
 import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.data.BlogBoard;
 import net.yacy.data.BlogBoardComments;
+import net.yacy.data.BookmarkHelper;
 import net.yacy.data.BookmarksDB;
 import net.yacy.data.ListManager;
 import net.yacy.data.MessageBoard;
@@ -133,11 +135,13 @@ import net.yacy.data.WorkTables;
 import net.yacy.data.wiki.WikiBoard;
 import net.yacy.data.wiki.WikiCode;
 import net.yacy.data.wiki.WikiParser;
+import net.yacy.data.ymark.YMarkTables;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
+import net.yacy.document.Parser.Failure;
 import net.yacy.document.content.DCEntry;
 import net.yacy.document.content.SurrogateReader;
 import net.yacy.document.importer.OAIListFriendsLoader;
@@ -195,8 +199,7 @@ import net.yacy.utils.crypt;
 
 import com.google.common.io.Files;
 
-public final class Switchboard extends serverSwitch
-{
+public final class Switchboard extends serverSwitch {
 
     // load slots
     public static int xstackCrawlSlots = 2000;
@@ -269,18 +272,12 @@ public final class Switchboard extends serverSwitch
 
     private final Semaphore shutdownSync = new Semaphore(0);
     private boolean terminate = false;
-
-    //private Object crawlingPausedSync = new Object();
-    //private boolean crawlingIsPaused = false;
-
+    private static Switchboard sb;
     public HashMap crawlJobsStatus = new HashMap();
 
-    private static Switchboard sb = null;
-
-    public Switchboard(final File dataPath, final File appPath, final String initPath, final String configPath)
-        throws IOException {
+    public Switchboard(final File dataPath, final File appPath, final String initPath, final String configPath) throws IOException {
         super(dataPath, appPath, initPath, configPath);
-
+        sb = this;
         // check if port is already occupied
         final int port = getConfigInt("port", 8090);
         try {
@@ -294,7 +291,6 @@ public final class Switchboard extends serverSwitch
         }
 
         MemoryTracker.startSystemProfiling();
-        sb = this;
 
         // set loglevel and log
         setLog(new Log("SWITCHBOARD"));
@@ -374,9 +370,9 @@ public final class Switchboard extends serverSwitch
         // start indexing management
         this.log.logConfig("Starting Indexing Management");
         final String networkName = getConfig(SwitchboardConstants.NETWORK_NAME, "");
-        final long fileSizeMax = (OS.isWindows) ? sb.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : sb.getConfigLong( "filesize.max.other", Integer.MAX_VALUE);
-        final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
-        final int partitionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
+        final long fileSizeMax = (OS.isWindows) ? this.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : this.getConfigLong( "filesize.max.other", Integer.MAX_VALUE);
+        final int redundancy = (int) this.getConfigLong("network.unit.dhtredundancy.senior", 1);
+        final int partitionExponent = (int) this.getConfigLong("network.unit.dht.partitionExponent", 0);
         this.networkRoot = new File(new File(indexPath, networkName), "NETWORK");
         this.queuesRoot = new File(new File(indexPath, networkName), "QUEUES");
         this.networkRoot.mkdirs();
@@ -1022,7 +1018,7 @@ public final class Switchboard extends serverSwitch
             "this is the content control import thread",
             null,
             new InstantBusyThread(
-                new ContentControlImportThread(sb),
+                new ContentControlImportThread(this),
                 "run",
                 SwitchboardConstants.PEER_PING_METHOD_JOBCOUNT,
                 SwitchboardConstants.PEER_PING_METHOD_FREEMEM,
@@ -1037,7 +1033,7 @@ public final class Switchboard extends serverSwitch
             "this is the content control filter update thread",
             null,
             new InstantBusyThread(
-                new ContentControlFilterUpdateThread(sb),
+                new ContentControlFilterUpdateThread(this),
                 "run",
                 SwitchboardConstants.PEER_PING_METHOD_JOBCOUNT,
                 SwitchboardConstants.PEER_PING_METHOD_FREEMEM,
@@ -1063,7 +1059,6 @@ public final class Switchboard extends serverSwitch
         this.trail = new LinkedBlockingQueue();
 
         this.log.logConfig("Finished Switchboard Initialization");
-        sb = this;
     }
 
     public int getIndexingProcessorsQueueSize() {
@@ -1235,10 +1230,9 @@ public final class Switchboard extends serverSwitch
         final int wordCacheMaxCount = (int) getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
         final long fileSizeMax =
-            (OS.isWindows) ? sb.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : sb
-                .getConfigLong("filesize.max.other", Integer.MAX_VALUE);
-        final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
-        final int partitionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
+            (OS.isWindows) ? this.getConfigLong("filesize.max.win", Integer.MAX_VALUE) : this.getConfigLong("filesize.max.other", Integer.MAX_VALUE);
+        final int redundancy = (int) this.getConfigLong("network.unit.dhtredundancy.senior", 1);
+        final int partitionExponent = (int) this.getConfigLong("network.unit.dht.partitionExponent", 0);
         final String networkName = getConfig(SwitchboardConstants.NETWORK_NAME, "");
         this.networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
         this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
@@ -1543,7 +1537,7 @@ public final class Switchboard extends serverSwitch
     public RankingProfile getRanking() {
         return (getConfig("rankingProfile", "").isEmpty())
             ? new RankingProfile(Classification.ContentDomain.TEXT)
-            : new RankingProfile("", crypt.simpleDecode(sb.getConfig("rankingProfile", "")));
+            : new RankingProfile("", crypt.simpleDecode(this.getConfig("rankingProfile", "")));
     }
 
     /**
@@ -1970,7 +1964,7 @@ public final class Switchboard extends serverSwitch
 
             // clear caches if necessary
             if ( !MemoryControl.request(8000000L, false) ) {
-                sb.index.fulltext().clearCache();
+                this.index.fulltext().clearCache();
                 SearchEventCache.cleanupEvents(false);
                 this.trail.clear();
             }
@@ -2246,7 +2240,7 @@ public final class Switchboard extends serverSwitch
             this.clusterhashes = this.peers.clusterHashes(getConfig("cluster.peers.yacydomain", ""));
 
             // check if we are reachable and try to map port again if not (e.g. when router rebooted)
-            if ( getConfigBool(SwitchboardConstants.UPNP_ENABLED, false) && sb.peers.mySeed().isJunior() ) {
+            if ( getConfigBool(SwitchboardConstants.UPNP_ENABLED, false) && this.peers.mySeed().isJunior() ) {
                 UPnP.addPortMapping();
             }
 
@@ -2698,6 +2692,122 @@ public final class Switchboard extends serverSwitch
             }
         }
     }
 
+    public void stackURLs(Set<DigestURI> rootURLs, final CrawlProfile profile, final Set<DigestURI> successurls, final Map<DigestURI, String> failurls) {
+        List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
+        for (DigestURI url: rootURLs) {
+            final DigestURI turl = url;
+            Thread t = new Thread() {
+                public void run() {
+                    String failreason;
+                    if ((failreason = Switchboard.this.stackUrl(profile, turl)) == null) successurls.add(turl); else failurls.put(turl, failreason);
+                }
+            };
+            t.start();
+            stackthreads.add(t);
+        }
+        for (Thread t: stackthreads) try {t.join(5000);} catch (InterruptedException e) {}
+    }
+
+
+    /**
+     * stack the url to the crawler
+     * @param profile
+     * @param url
+     * @return null if this was ok. If this failed, return a string with a fail reason
+     */
+    public String stackUrl(CrawlProfile profile, DigestURI url) {
+
+        byte[] handle = ASCII.getBytes(profile.handle());
+
+        // remove url from the index to be prepared for a re-crawl
+        final byte[] urlhash = url.hash();
+        this.index.fulltext().remove(urlhash);
+        this.crawlQueues.noticeURL.removeByURLHash(urlhash);
+        this.crawlQueues.errorURL.remove(urlhash);
+
+        // special handling of ftp protocol
+        if (url.isFTP()) {
+            try {
+                this.crawler.putActive(handle, profile);
+                this.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+                this.crawlStacker.enqueueEntriesFTP(this.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
+                return null;
+            } catch (final Exception e) {
+                // mist
+                Log.logException(e);
+                return "problem crawling an ftp site: " + e.getMessage();
+            }
+        }
+
+        // get a scraper to get the title
+        Document scraper;
+        try {
+            scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+        } catch (IOException e) {
+            Log.logException(e);
+            return "scraper cannot load URL: " + e.getMessage();
+        }
+
+        final String title = scraper == null ? url.toNormalform(true) : scraper.dc_title();
+        final String description = scraper.dc_description();
+
+        // add the url to the crawl stack
+        this.crawler.removePassive(handle); // if there is an old entry, delete it
+        this.crawler.putActive(handle, profile);
+        final String reasonString = this.crawlStacker.stackCrawl(new Request(
+                this.peers.mySeed().hash.getBytes(),
+                url,
+                null,
+                "CRAWLING-ROOT",
+                new Date(),
+                profile.handle(),
+                0,
+                0,
+                0,
+                0
+                ));
+
+        if (reasonString != null) return reasonString;
+
+        // create a bookmark from crawl start url
+        //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
+        final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
+        tags.add("crawlStart");
+        final String[] keywords = scraper.dc_subject();
+        if (keywords != null) {
+            for (final String k: keywords) {
+                final String kk = BookmarkHelper.cleanTagsString(k);
+                if (kk.length() > 0) tags.add(kk);
+            }
+        }
+        String tagStr = tags.toString();
+        if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2);
+
+        // we will create always a bookmark to use this to track crawled hosts
+        final BookmarksDB.Bookmark bookmark = this.bookmarksDB.createBookmark(url.toNormalform(true), "admin");
+        if (bookmark != null) {
+            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
+            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
+            bookmark.setOwner("admin");
+            bookmark.setPublic(false);
+            bookmark.setTags(tags, true);
+            this.bookmarksDB.saveBookmark(bookmark);
+        }
+
+        // do the same for ymarks
+        // TODO: could a non admin user add crawls?
+        try {
+            this.tables.bookmarks.createBookmark(this.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");
+        } catch (IOException e) {
+            Log.logException(e);
+        } catch (Failure e) {
+            Log.logException(e);
+        }
+
+        // that was ok
+        return null;
+    }
+
     /**
      * load the content of a URL, parse the content and add the content to the index This process is started
      * concurrently. The method returns immediately after the call.
@@ -2718,7 +2828,7 @@ public final class Switchboard extends serverSwitch
             return; // don't do double-work
         }
         final Request request = this.loader.request(url, true, true);
-        final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+        final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
         final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
         final String urls = url.toNormalform(true);
         if ( acceptedError != null ) {
@@ -2793,7 +2903,7 @@ public final class Switchboard extends serverSwitch
             return; // don't do double-work
         }
         final Request request = this.loader.request(url, true, true);
-        final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+        final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
         final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
         if (acceptedError != null) {
             this.log.logInfo("addToCrawler: cannot load "
@@ -2804,9 +2914,9 @@ public final class Switchboard extends serverSwitch
         }
         final String s;
         if (asglobal) {
-            s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots);
+            s = this.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots);
         } else {
-            s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots);
+            s = this.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots);
         }
 
         if (s != null) {
@@ -3179,7 +3289,7 @@ public final class Switchboard extends serverSwitch
                     if (links != null) {
                         if (links.size() < 1000) { // limit to 1000 to skip large index pages
                             final Iterator i = links.keySet().iterator();
-                            final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal",false);
+                            final boolean globalcrawljob = Switchboard.this.getConfigBool("heuristic.searchresults.crawlglobal",false);
                             while (i.hasNext()) {
                                 url = DigestURI.toDigestURI(i.next());
                                 boolean islocal = url.getHost().contentEquals(startUrl.getHost());
@@ -3239,7 +3349,7 @@ public final class Switchboard extends serverSwitch
                 searchEvent.getRankingResult().oneFeederStarted();
                 try {
                     final Response response =
-                        sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
+                        Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
                     final byte[] resource = (response == null) ? null : response.getContent();
                     //System.out.println("BLEKKO: " + UTF8.String(resource));
                     rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
@@ -3337,7 +3447,7 @@ public final class Switchboard extends serverSwitch
                 if ( Thread.currentThread().isInterrupted() ) {
                     break;
                 }
-                seedListFileURL = sb.getConfig("network.unit.bootstrap.seedlist" + c, "");
+                seedListFileURL = this.getConfig("network.unit.bootstrap.seedlist" + c, "");
                 if ( seedListFileURL.isEmpty() ) {
                     break;
                 }
diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java
index 980c7cddd..cf6ab1a8b 100644
--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@@ -295,6 +295,13 @@ public final class Fulltext implements Iterable {
         if (MemoryControl.shortStatus()) clearCache();
     }
 
+    public void removeConcurrently(final List<byte[]> deleteIDs) {
+        new Thread() {
+            public void run() {for (byte[] id: deleteIDs) {remove(id);}}
+        }.start();
+        this.solr.commit();
+    }
+
     public boolean remove(final byte[] urlHash) {
         if (urlHash == null) return false;
         try {
@@ -720,7 +727,7 @@ public final class Fulltext implements Iterable {
     }
 
     /**
-     * using a fragment of the url hash (5 bytes: bytes 6 to 10) it is possible to address all urls from a specific domain
+     * using a fragment of the url hash (6 bytes: bytes 6 to 11) it is possible to address all urls from a specific domain
      * here such a fragment can be used to delete all these domains at once
     * @param hosthash
     * @return number of deleted domains
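
A minimal, self-contained sketch of the fan-out pattern that the patch moves into Switchboard.stackURLs(): one worker thread per crawl-start URL, shared success/failure collections, and a bounded join(5000) so a slow loader cannot block the caller indefinitely. All names here (ConcurrentStackSketch, stackOne, stackAll) are invented for the example, not YaCy API; unlike the patch, which fills plain HashSet/HashMap instances from several threads, the example hands in concurrent collections.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

// Illustrative fan-out: stack every root URL in its own thread, then wait a bounded time.
public class ConcurrentStackSketch {

    // placeholder for the real stacking call: null means success, otherwise a fail reason
    static String stackOne(String url) {
        return url.startsWith("http://") || url.startsWith("https://") ? null : "unsupported protocol";
    }

    public static void stackAll(final Set<String> rootURLs,
                                final Set<String> successurls,
                                final Map<String, String> failurls) {
        final List<Thread> stackthreads = new ArrayList<Thread>();
        for (final String url : rootURLs) {
            Thread t = new Thread() {
                @Override
                public void run() {
                    final String failreason = stackOne(url);
                    if (failreason == null) successurls.add(url); else failurls.put(url, failreason);
                }
            };
            t.start();
            stackthreads.add(t);
        }
        // bounded wait, as in the patch: at most 5 seconds per worker thread
        for (Thread t : stackthreads) {
            try { t.join(5000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
        }
    }

    public static void main(String[] args) {
        Set<String> roots = ConcurrentHashMap.newKeySet();
        roots.add("http://example.org/");
        roots.add("gopher://example.org/");
        Set<String> ok = ConcurrentHashMap.newKeySet();
        Map<String, String> failed = new ConcurrentHashMap<String, String>();
        stackAll(roots, ok, failed);
        System.out.println("stacked: " + ok + " failed: " + failed);
    }
}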
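
The FileUtils change swaps chained String.replace() calls for three statically compiled Patterns. In the JDK versions this code targeted, String.replace(CharSequence, CharSequence) compiled a fresh literal Pattern on every call, so hoisting the patterns out of the per-entry loop avoids repeated compilation. The sketch below illustrates the same idea under invented names (PropertyEscape, escapeKey, escapeValue); it wraps the replacements in Matcher.quoteReplacement() because '\' and '$' are treated specially in replaceAll() replacement strings when the goal is to insert backslashes literally.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Escaping helper in the spirit of FileUtils.saveMap(): patterns compiled once, reused per entry.
public final class PropertyEscape {

    private static final Pattern BACKSLASH = Pattern.compile("\\\\"); // matches one literal backslash
    private static final Pattern NEWLINE   = Pattern.compile("\n");
    private static final Pattern EQUALS    = Pattern.compile("=");

    public static String escapeKey(String key) {
        key = BACKSLASH.matcher(key).replaceAll(Matcher.quoteReplacement("\\\\")); // \  -> \\
        key = NEWLINE.matcher(key).replaceAll(Matcher.quoteReplacement("\\n"));    // LF -> \n
        key = EQUALS.matcher(key).replaceAll(Matcher.quoteReplacement("\\="));     // =  -> \=
        return key;
    }

    public static String escapeValue(String value) {
        value = BACKSLASH.matcher(value).replaceAll(Matcher.quoteReplacement("\\\\"));
        value = NEWLINE.matcher(value).replaceAll(Matcher.quoteReplacement("\\n"));
        return value;
    }

    public static void main(String[] args) {
        System.out.println(escapeKey("a=b\\c\nd")); // prints a\=b\\c\nd
    }
}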
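
Fulltext.removeConcurrently() in this patch starts the deletions on a background thread and issues this.solr.commit() from the calling thread, so the commit may run before the background loop has sent all deletes. The fragment below is only an illustrative variant (SolrStore is a stand-in interface, not YaCy's API) showing the alternative of committing from inside the worker once the loop has finished, which keeps the ordering deterministic.

import java.util.List;

// Stand-in for the Solr-backed store used by Fulltext; only the calls needed for the sketch.
interface SolrStore {
    void delete(byte[] id);
    void commit();
}

final class ConcurrentRemover {
    private final SolrStore solr;

    ConcurrentRemover(SolrStore solr) { this.solr = solr; }

    public void removeConcurrently(final List<byte[]> deleteIDs) {
        new Thread() {
            @Override
            public void run() {
                for (byte[] id : deleteIDs) solr.delete(id);
                solr.commit(); // commit only after all deletions were issued
            }
        }.start();
    }
}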