diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index aa1abf100..827db3285 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -563,7 +563,7 @@ public class CrawlQueues { try { request.setStatus("loading", WorkflowJob.STATUS_RUNNING); final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); - Response response = sb.loader.load(request, true, maxFileSize); + Response response = sb.loader.load(request, maxFileSize); if (response == null) { request.setStatus("error", WorkflowJob.STATUS_FINISHED); if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)"); diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 5d2917e0e..32e21bb88 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -28,7 +28,6 @@ import java.io.IOException; import java.util.Date; import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.repository.Blacklist; @@ -75,14 +74,14 @@ public final class HTTPLoader { this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000); } - public Response load(final Request entry, final boolean acceptOnlyParseable, long maxFileSize) throws IOException { + public Response load(final Request entry, long maxFileSize) throws IOException { long start = System.currentTimeMillis(); - Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize); + Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize); Latency.update(entry.url(), System.currentTimeMillis() - start); return doc; } - private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount, final long maxFileSize) throws IOException { + private Response load(final Request request, final int retryCount, final long maxFileSize) throws IOException { if (retryCount < 0) { sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded"); @@ -96,15 +95,6 @@ public final class HTTPLoader { final boolean ssl = request.url().getProtocol().equals("https"); if (port < 0) port = (ssl) ? 443 : 80; - // if not the right file type then reject file - if (acceptOnlyParseable) { - String supportError = TextParser.supportsExtension(request.url()); - if (supportError != null) { - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, supportError); - throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError); - } - } - // check if url is in blacklist final String hostlow = host.toLowerCase(); if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) { @@ -138,15 +128,6 @@ public final class HTTPLoader { if (res.getStatusCode() == 200 || res.getStatusCode() == 203) { // the transfer is ok - if (acceptOnlyParseable) { - // if the response has not the right file type then reject file - String supportError = TextParser.supports(request.url(), res.getResponseHeader().mime()); - if (supportError != null) { - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, supportError); - throw new IOException("REJECTED WRONG MIME TYPE, mime = " + res.getResponseHeader().mime() + ": " + supportError); - } - } - // we write the new cache entry to file system directly res.setAccountingName("CRAWLER"); final byte[] responseBody = res.getData(); @@ -202,7 +183,7 @@ public final class HTTPLoader { // retry crawling with new url request.redirectURL(redirectionUrl); - return load(request, acceptOnlyParseable, retryCount - 1, maxFileSize); + return load(request, retryCount - 1, maxFileSize); } } else { // if the response has not the right response type then reject file diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 08a48caf0..5341253b0 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -367,7 +367,7 @@ public final class Switchboard extends serverSwitch { indexSegments.segment(Segments.Process.LOCALCRAWLING), peers, true, - 30000); + 10000); // set up local robots.txt this.robotstxtConfig = RobotsTxtConfig.init(this); @@ -894,7 +894,7 @@ public final class Switchboard extends serverSwitch { indexSegments.segment(Segments.Process.LOCALCRAWLING), peers, true, - 30000); + 10000); // create new web structure this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map")); diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 14a3e6c26..70b46b367 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -368,7 +368,7 @@ public final class yacyClient { } @SuppressWarnings("unchecked") - public static String[] search( + public static int search( final yacySeed mySeed, final String wordhashes, final String excludehashes, @@ -392,7 +392,6 @@ public final class yacyClient { final Bitfield constraint ) { // send a search request to peer with remote Hash - // this mainly converts the words into word hashes // INPUT: // iam : complete seed of the requesting peer @@ -437,7 +436,7 @@ public final class yacyClient { } catch (final IOException e) { yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + "), score=" + target.selectscore); //yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage()); - return null; + return -1; } if (result == null || result.isEmpty()) { @@ -447,7 +446,7 @@ public final class yacyClient { + target.getName() + " (zero response), score=" + target.selectscore); - return null; + return -1; } // compute all computation times @@ -468,14 +467,14 @@ public final class yacyClient { // now create a plasmaIndex out of this result // System.out.println("yacyClient: " + ((urlhashes.length() == 0) ? "primary" : "secondary")+ " search result = " + result.toString()); // debug - int results = 0, joincount = 0; + int urlcount = 0, joincount = 0; try { - results = Integer.parseInt(result.get("count")); - joincount = Integer.parseInt(result.get("joincount")); + joincount = Integer.parseInt(result.get("joincount")); // the complete number of hits at remote site + urlcount = Integer.parseInt(result.get("count")); // the number of hits that are returned in the result list } catch (final NumberFormatException e) { yacyCore.log.logInfo("SEARCH failed FROM " + target.hash + ":" + target.getName() + ", wrong output format: " + e.getMessage()); //yacyCore.peerActions.peerDeparture(target, "search request to peer created number format exception"); - return null; + return -1; } // System.out.println("***result count " + results); @@ -488,14 +487,13 @@ public final class yacyClient { container[i] = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhashes.substring(i * Word.commonHashLength, (i + 1) * Word.commonHashLength).getBytes(), count); } catch (RowSpaceExceededException e) { Log.logException(e); - return null; + return -1; } } // insert results to containers URIMetadataRow urlEntry; - final String[] urls = new String[results]; - for (int n = 0; n < results; n++) { + for (int n = 0; n < urlcount; n++) { // get one single search result urlEntry = URIMetadataRow.importEntry(result.get("resource" + n)); if (urlEntry == null) continue; @@ -504,27 +502,26 @@ public final class yacyClient { final URIMetadataRow.Components metadata = urlEntry.metadata(); if (metadata == null) continue; if (blacklist.isListed(Blacklist.BLACKLIST_SEARCH, metadata.url())) { - if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): filtered blacklisted url " + metadata.url() + " from peer " + target.getName()); + if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search: filtered blacklisted url " + metadata.url() + " from peer " + target.getName()); continue; // block with backlist } final String urlRejectReason = Switchboard.getSwitchboard().crawlStacker.urlInAcceptedDomain(metadata.url()); if (urlRejectReason != null) { - if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName()); + if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search: rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName()); continue; // reject url outside of our domain } // save the url entry - Reference entry; - if (urlEntry.word() == null) { - if (yacyCore.log.isWarning()) yacyCore.log.logWarning("remote search (client): no word attached from peer " + target.getName() + ", version " + target.getVersion()); + Reference entry = urlEntry.word(); + if (entry == null) { + if (yacyCore.log.isWarning()) yacyCore.log.logWarning("remote search: no word attached from peer " + target.getName() + ", version " + target.getVersion()); continue; // no word attached } // the search-result-url transports all the attributes of word indexes - entry = urlEntry.word(); if (!Base64Order.enhancedCoder.equal(entry.metadataHash(), urlEntry.hash())) { - if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): url-hash " + new String(urlEntry.hash()) + " does not belong to word-attached-hash " + new String(entry.metadataHash()) + "; url = " + metadata.url() + " from peer " + target.getName()); + yacyCore.log.logInfo("remote search: url-hash " + new String(urlEntry.hash()) + " does not belong to word-attached-hash " + new String(entry.metadataHash()) + "; url = " + metadata.url() + " from peer " + target.getName()); continue; // spammed } @@ -554,23 +551,30 @@ public final class yacyClient { break; } } - - // store url hash for statistics - urls[n] = new String(urlEntry.hash()); } // store remote result to local result container synchronized (containerCache) { // insert one container into the search result buffer - containerCache.add(container[0], false, joincount); // one is enough - - // integrate remote topwords - final String references = result.get("references"); - yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references); - if (references != null) { - // add references twice, so they can be counted (must have at least 2 entries) - containerCache.addTopic(references.split(",")); - containerCache.addTopic(references.split(",")); + containerCache.add(container[0], false, joincount); // one is enough, only the references are used, not the word + } + // insert the containers to the index + for (ReferenceContainer c: container) try { + indexSegment.termIndex().add(c); + } catch (Exception e) { + Log.logException(e); + } + yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + container[0].size() + "/" + joincount + " references for joined word queries"); + + // integrate remote top-words/topics + final String references = result.get("references"); + if (references != null && references.length() > 0) { + yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent topics: " + references); + // add references twice, so they can be counted (must have at least 2 entries) + String[] rs = references.split(","); + synchronized (containerCache) { + containerCache.addTopic(rs); + containerCache.addTopic(rs); } } @@ -592,7 +596,7 @@ public final class yacyClient { ci = new ByteBuffer(entry.getValue().getBytes("UTF-8")); } catch (UnsupportedEncodingException e) { Log.logException(e); - return null; + return -1; } //System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString()); ReferenceContainer.decompressIndex(singleAbstract, ci, target.hash); @@ -600,14 +604,8 @@ public final class yacyClient { } } } + if (abstractCache.size() > 0) yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + abstractCache.size() + " index abstracts"); } - - // insert the containers to the index - for (int m = 0; m < words; m++) try { - indexSegment.termIndex().add(container[m]); - } catch (Exception e) { - Log.logException(e); - } // generate statistics long searchtime; @@ -617,7 +615,7 @@ public final class yacyClient { searchtime = totalrequesttime; } if (yacyCore.log.isFine()) yacyCore.log.logFine("SEARCH " - + results + + urlcount + " URLS FROM " + target.hash + ":" @@ -627,7 +625,7 @@ public final class yacyClient { + ", searchtime=" + searchtime + ", netdelay=" + (totalrequesttime - searchtime) + ", references=" + result.get("references")); - return urls; + return urlcount; } public static Map permissionMessage(final yacySeedDB seedDB, final String targetHash) { diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 1c764e7ff..19d10ca96 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -56,7 +56,7 @@ public class yacySearch extends Thread { final private Map> abstractCache; final private Blacklist blacklist; final private yacySeed targetPeer; - private String[] urls; + private int urls; private final int count, maxDistance; final private RankingProfile rankingProfile; final private Pattern prefer, filter; @@ -103,7 +103,7 @@ public class yacySearch extends Thread { this.abstractCache = abstractCache; this.blacklist = blacklist; this.targetPeer = targetPeer; - this.urls = null; + this.urls = -1; this.count = count; this.maxDistance = maxDistance; this.rankingProfile = rankingProfile; @@ -119,13 +119,11 @@ public class yacySearch extends Thread { count, maxDistance, global, partitions, targetPeer, indexSegment, crawlResults, containerCache, abstractCache, blacklist, rankingProfile, constraint); - if (urls != null) { + if (urls >= 0) { // urls is an array of url hashes. this is only used for log output - final StringBuilder urllist = new StringBuilder(this.urls.length * 13); - for (int i = 0; i < this.urls.length; i++) urllist.append(this.urls[i]).append(' '); - yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + urls.length + " links for word hash " + wordhashes + ": " + new String(urllist)); - peers.mySeed().incRI(urls.length); - peers.mySeed().incRU(urls.length); + //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + urls.length + " links for word hash " + wordhashes + ": " + new String(urllist)); + peers.mySeed().incRI(urls); + peers.mySeed().incRU(urls); } else { yacyCore.log.logInfo("REMOTE SEARCH - no answer from remote peer " + targetPeer.hash + ":" + targetPeer.getName()); } @@ -144,7 +142,7 @@ public class yacySearch extends Thread { } public int links() { - return this.urls.length; + return this.urls; } public int count() { @@ -218,7 +216,8 @@ public class yacySearch extends Thread { seed = dhtEnum.next(); if (seed == null) continue; if (seed.matchPeerTags(wordhashes)) { - Log.logInfo("PLASMA", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + seed.getPeerTags().toString()); + String specialized = seed.getPeerTags().toString(); + if (!specialized.equals("[*]")) Log.logInfo("PLASMA", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + specialized); regularSeeds.remove(seed.hash); ranking.deleteScore(seed.hash); matchingSeeds.put(seed.hash, seed); @@ -335,7 +334,7 @@ public class yacySearch extends Thread { public static int collectedLinks(final yacySearch[] searchThreads) { int links = 0; for (int i = 0; i < searchThreads.length; i++) { - if (!(searchThreads[i].isAlive())) links += searchThreads[i].urls.length; + if (!(searchThreads[i].isAlive()) && searchThreads[i].urls > 0) links += searchThreads[i].urls; } return links; } diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 0e325b3bd..1883aab80 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -69,6 +69,7 @@ public class htmlParser extends AbstractParser implements Idiom { SUPPORTED_EXTENSIONS.add("cfm"); SUPPORTED_EXTENSIONS.add("asp"); SUPPORTED_EXTENSIONS.add("aspx"); + SUPPORTED_EXTENSIONS.add("tex"); SUPPORTED_EXTENSIONS.add("txt"); SUPPORTED_EXTENSIONS.add("jsp"); SUPPORTED_EXTENSIONS.add("pl"); @@ -77,6 +78,7 @@ public class htmlParser extends AbstractParser implements Idiom { SUPPORTED_MIME_TYPES.add("text/xhtml+xml"); SUPPORTED_MIME_TYPES.add("application/xhtml+xml"); SUPPORTED_MIME_TYPES.add("application/x-httpd-php"); + SUPPORTED_MIME_TYPES.add("application/x-tex"); SUPPORTED_MIME_TYPES.add("text/plain"); SUPPORTED_MIME_TYPES.add("text/sgml"); SUPPORTED_MIME_TYPES.add("text/csv"); diff --git a/source/net/yacy/kelondro/data/word/WordReferenceVars.java b/source/net/yacy/kelondro/data/word/WordReferenceVars.java index dd02996de..6634ec560 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java @@ -393,6 +393,16 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc public static BlockingQueue transform(ReferenceContainer container) { LinkedBlockingQueue out = new LinkedBlockingQueue(); + if (container.size() <= 100) { + // transform without concurrency to omit thread creation overhead + for (Row.Entry entry: container) try { + out.put(new WordReferenceVars(new WordReferenceRow(entry))); + } catch (InterruptedException e) {} + try { + out.put(WordReferenceVars.poison); + } catch (InterruptedException e) {} + return out; + } Thread distributor = new TransformDistributor(container, out); distributor.start(); diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 77730fd32..a0f8e1148 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -104,7 +104,7 @@ public final class LoaderDispatcher { final boolean forText, final boolean global, final long maxFileSize) throws IOException { - return load(request(url, forText, global), forText, maxFileSize); + return load(request(url, forText, global), maxFileSize); } /** @@ -122,12 +122,12 @@ public final class LoaderDispatcher { final boolean global, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize) throws IOException { - return load(request(url, forText, global), forText, cacheStratgy, maxFileSize); + return load(request(url, forText, global), cacheStratgy, maxFileSize); } public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException { - byte[] b = load(request(url, false, true), false, cacheStratgy, maxFileSize).getContent(); + byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize).getContent(); if (b == null) throw new IOException("load == null"); File tmp = new File(targetFile.getAbsolutePath() + ".tmp"); @@ -169,14 +169,14 @@ public final class LoaderDispatcher { 0); } - public Response load(final Request request, final boolean acceptOnlyParseable, long maxFileSize) throws IOException { + public Response load(final Request request, long maxFileSize) throws IOException { CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST; if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy(); - return load(request, acceptOnlyParseable, cacheStrategy, maxFileSize); + return load(request, cacheStrategy, maxFileSize); } - public Response load(final Request request, final boolean acceptOnlyParseable, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException { + public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException { // get the protocol of the next URL final String protocol = request.url().getProtocol(); final String host = request.url().getHost(); @@ -258,7 +258,7 @@ public final class LoaderDispatcher { // load resource from the internet Response response = null; - if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable, maxFileSize); + if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize); if (protocol.equals("ftp")) response = ftpLoader.load(request, true); if (protocol.equals("smb")) response = smbLoader.load(request, true); if (protocol.equals("file")) response = fileLoader.load(request, true);