diff --git a/build.properties b/build.properties index e7dcc5333..2c369a989 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.385 +releaseVersion=0.386 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr} diff --git a/htroot/IndexCreateIndexingQueue_p.html b/htroot/IndexCreateIndexingQueue_p.html index 9698621bc..5654f565f 100644 --- a/htroot/IndexCreateIndexingQueue_p.html +++ b/htroot/IndexCreateIndexingQueue_p.html @@ -20,7 +20,6 @@ There are #[num]# entries in the indexing queue:
Initiator Depth Modified Date -#HREF Anchor Name URL @@ -29,7 +28,6 @@ There are #[num]# entries in the indexing queue:
#[initiator]# #[depth]# #[modified]# -#[href]# #[anchor]# #[url]# diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java index 078b1b260..accb2fda9 100644 --- a/htroot/IndexCreateIndexingQueue_p.java +++ b/htroot/IndexCreateIndexingQueue_p.java @@ -100,7 +100,6 @@ public class IndexCreateIndexingQueue_p { prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth()); prop.put("indexing-queue_list_"+i+"_modified", (pcentry.responseHeader() == null) ? "null" : daydate(pcentry.responseHeader().lastModified())); - prop.put("indexing-queue_list_"+i+"_href", pcentry.forkFactor()); prop.put("indexing-queue_list_"+i+"_anchor", pcentry.anchorName()); prop.put("indexing-queue_list_"+i+"_url", pcentry.normalizedURLString()); dark = !dark; diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 080f56a4d..7775ec8ef 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -443,6 +443,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt requestDate, // init date 0, // crawling depth url, // url + "", // name of the url is unknown requestHeader, // request headers "200 OK", // request status cachedResponseHeader, // response headers @@ -486,7 +487,6 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt GZIPOutputStream gzippedOut = null; httpChunkedOutputStream chunkedOut = null; OutputStream hfos = null; - htmlFilterContentScraper scraper = null; httpc remote = null; httpc.response res = null; @@ -568,7 +568,8 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt plasmaHTCache.Entry cacheEntry = cacheManager.newEntry( requestDate, 0, - url, + url, + "", requestHeader, res.status, res.responseHeader, @@ -576,33 +577,8 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt switchboard.defaultProxyProfile ); - // handle file types - if (((ext == null) || (!(plasmaParser.mediaExtContains(ext)))) && - (plasmaParser.realtimeParsableMimeTypesContains(res.responseHeader.mime()))) { - // this is a file that is a possible candidate for parsing by the indexer - if (transformer.isIdentityTransformer()) { - this.theLogger.logDebug("create passthrough (parse candidate) for url " + url); - // no transformation, only passthrough - // this isng especially the case if the bluelist is empty - // in that case, the content is not scraped here but later - hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond); - } else { - // make a scraper and transformer - this.theLogger.logDebug("create scraper for url " + url); - scraper = new htmlFilterContentScraper(url); - hfos = new htmlFilterOutputStream((gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond), scraper, transformer, (ext.length() == 0)); - if (((htmlFilterOutputStream) hfos).binarySuspect()) { - scraper = null; // forget it, may be rubbish - this.theLogger.logDebug("Content of " + url + " is probably binary. deleted scraper."); - } - cacheEntry.scraper = scraper; - } - } else { - this.theLogger.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped"); - scraper = null; - hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond); - cacheEntry.scraper = scraper; - } + // make output stream + hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond); // handle incoming cookies handleIncomingCookies(res.responseHeader, host, ip); diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index a531eae67..d5c7d165b 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -132,15 +132,15 @@ public final class plasmaCrawlLoader extends Thread { public void close() { try { - // setting the stop flag to true - this.stopped = true; - - // interrupting the plasmaCrawlLoader - this.interrupt(); - - // waiting for the thread to finish ... + // setting the stop flag to true + this.stopped = true; + + // interrupting the plasmaCrawlLoader + this.interrupt(); + + // waiting for the thread to finish ... this.log.logInfo("Waiting for plasmaCrawlLoader shutdown ..."); - this.join(5000); + this.join(5000); } catch (Exception e) { // we where interrupted while waiting for the crawlLoader Thread to finish } @@ -186,6 +186,7 @@ public final class plasmaCrawlLoader extends Thread { public void loadParallel( URL url, + String name, String referer, String initiator, int depth, @@ -195,7 +196,7 @@ public final class plasmaCrawlLoader extends Thread { int crawlingPriority = 5; // creating a new crawler queue object - plasmaCrawlLoaderMessage theMsg = new plasmaCrawlLoaderMessage(url, referer,initiator,depth,profile, crawlingPriority); + plasmaCrawlLoaderMessage theMsg = new plasmaCrawlLoaderMessage(url, name, referer, initiator, depth, profile, crawlingPriority); // adding the message to the queue try { @@ -287,8 +288,7 @@ class CrawlerMessageQueue { } -final class CrawlerPool extends GenericObjectPool -{ +final class CrawlerPool extends GenericObjectPool { private final ThreadGroup theThreadGroup; public boolean isClosed = false; diff --git a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java index 0f939453a..aaed6ce64 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java +++ b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java @@ -47,6 +47,7 @@ import java.net.URL; public final class plasmaCrawlLoaderMessage { public final int crawlingPriority; public final URL url; + public final String name; public final String referer; public final String initiator; public final int depth; @@ -54,13 +55,15 @@ public final class plasmaCrawlLoaderMessage { // loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) { public plasmaCrawlLoaderMessage( - URL url, + URL url, + String name, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile, int crawlingPriority) { this.url = url; + this.name = name; this.referer = referer; this.initiator = initiator; this.depth = depth; diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index 914999d83..c78820fb4 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -71,6 +71,7 @@ public final class plasmaCrawlWorker extends Thread { public plasmaCrawlLoaderMessage theMsg; private URL url; + private String name; private String referer; private String initiator; private int depth; @@ -125,6 +126,7 @@ public final class plasmaCrawlWorker extends Thread { this.theMsg = theMsg; this.url = theMsg.url; + this.name = theMsg.name; this.referer = theMsg.referer; this.initiator = theMsg.initiator; this.depth = theMsg.depth; @@ -198,7 +200,7 @@ public final class plasmaCrawlWorker extends Thread { public void execute() throws IOException { try { this.setName(this.threadBaseName + "_" + this.url); - load(this.url, this.referer, this.initiator, this.depth, this.profile, + load(this.url, this.name, this.referer, this.initiator, this.depth, this.profile, this.socketTimeout, this.remoteProxyHost, this.remoteProxyPort, this.remoteProxyUse, this.cacheManager, this.log); @@ -220,6 +222,7 @@ public final class plasmaCrawlWorker extends Thread { public static void load( URL url, + String name, String referer, String initiator, int depth, @@ -232,6 +235,7 @@ public final class plasmaCrawlWorker extends Thread { serverLog log ) throws IOException { load(url, + name, referer, initiator, depth, @@ -248,7 +252,8 @@ public final class plasmaCrawlWorker extends Thread { } private static void load( - URL url, + URL url, + String name, String referer, String initiator, int depth, @@ -300,7 +305,7 @@ public final class plasmaCrawlWorker extends Thread { long contentLength = res.responseHeader.contentLength(); // reserve cache entry - plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile); + plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, name, requestHeader, res.status, res.responseHeader, initiator, profile); // request has been placed and result has been returned. work off response File cacheFile = cacheManager.getCachePath(url); @@ -355,6 +360,7 @@ public final class plasmaCrawlWorker extends Thread { log.logInfo("Redirection detected ('" + res.status + "') for url " + url.toString() + "\nRedirecting request to: " + redirectionUrl); load(redirectionUrl, + name, referer, initiator, depth, @@ -383,18 +389,19 @@ public final class plasmaCrawlWorker extends Thread { log.logWarning("Problems detected while receiving gzip encoded content from '" + url.toString() + "'. Retrying request without using gzip content encoding."); load(url, - referer, - initiator, - depth, - profile, - socketTimeout, - remoteProxyHost, - remoteProxyPort, - remoteProxyUse, - cacheManager, - log, - 0, - false + name, + referer, + initiator, + depth, + profile, + socketTimeout, + remoteProxyHost, + remoteProxyPort, + remoteProxyUse, + cacheManager, + log, + 0, + false ); } else { // this may happen if the targeted host does not exist or anything with the diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 2e47a39ed..49dc6e5d4 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -431,13 +431,12 @@ public final class plasmaHTCache { (urlString.toLowerCase().indexOf(".exe") >= 0)); } - public Entry newEntry(Date initDate, int depth, URL url, + public Entry newEntry(Date initDate, int depth, URL url, String name, httpHeader requestHeader, String responseStatus, httpHeader responseHeader, String initiator, plasmaCrawlProfile.entry profile) { - //System.out.println("NEW ENTRY: " + url.toString()); // DEBUG - return new Entry(initDate, depth, url, requestHeader, responseStatus, responseHeader, initiator, profile); + return new Entry(initDate, depth, url, name, requestHeader, responseStatus, responseHeader, initiator, profile); } public final class Entry { @@ -451,6 +450,7 @@ public final class plasmaHTCache { public File cacheFile; // the cache file public byte[] cacheArray; // or the cache as byte-array public URL url; + public String name; // the name of the link, read as anchor from an -tag public String nomalizedURLHash; public String nomalizedURLString; public int status; // cache load/hit/stale etc status @@ -459,10 +459,9 @@ public final class plasmaHTCache { public String language; public plasmaCrawlProfile.entry profile; private String initiator; - public htmlFilterContentScraper scraper; - public Entry(Date initDate, int depth, URL url, + public Entry(Date initDate, int depth, URL url, String name, httpHeader requestHeader, String responseStatus, httpHeader responseHeader, String initiator, @@ -476,6 +475,7 @@ public final class plasmaHTCache { System.out.println("internal error at httpdProxyCache.Entry: " + e); System.exit(-1); } + this.name = name; this.cacheFile = getCachePath(this.url); this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString); @@ -510,9 +510,11 @@ public final class plasmaHTCache { // to be defined later: this.cacheArray = null; this.status = CACHE_UNFILLED; - this.scraper = null; } + public String name() { + return name; + } public String initiator() { return initiator; } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 8f410c37d..9d35255e6 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -354,7 +354,8 @@ public class plasmaSnippetCache { private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException { plasmaCrawlWorker.load( - url, + url, + "", null, null, 0, diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 18a403fda..90c898e17 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -264,7 +264,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser this.parser = new plasmaParser(); // initialize switchboard queue - sbQueue = new plasmaSwitchboardQueue(this.cacheManager, urlPool.loadedURL, new File(plasmaPath, "switchboardQueue0.stack"), 10, profiles); + sbQueue = new plasmaSwitchboardQueue(this.cacheManager, urlPool.loadedURL, new File(plasmaPath, "switchboardQueue1.stack"), 10, profiles); // define an extension-blacklist log.logSystem("Parser: Initializing Extension Mappings for Media/Parser"); @@ -453,8 +453,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser cacheManager.push(entry); } - - synchronized public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) throws IOException { if (entry == null) return false; @@ -480,9 +478,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser enQueue(sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()), entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE), entry.initiator(), entry.depth, entry.profile.handle(), - (entry.scraper == null) ? 0 : entry.scraper.getAnchors().size(), - (entry.scraper == null) ? 0 : entry.scraper.getImages().size(), - (entry.scraper == null) ? "" : entry.scraper.getHeadline() + entry.name() )); } else if (entry.status == plasmaHTCache.CACHE_PASSING) { // even if the file should not be stored in the cache, it can be used to be indexed @@ -492,9 +488,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser enQueue(sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()), entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE), entry.initiator(), entry.depth, entry.profile.handle(), - (entry.scraper == null) ? 0 : entry.scraper.getAnchors().size(), - (entry.scraper == null) ? 0 : entry.scraper.getImages().size(), - (entry.scraper == null) ? "" : entry.scraper.getHeadline() + entry.name() )); } @@ -504,9 +498,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser case plasmaHTCache.CACHE_UNFILLED: log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break; case plasmaHTCache.CACHE_FILL: - log.logInfo("CACHE FILL: " + entry.cacheFile + - ((entry.cacheArray == null) ? "" : " (cacheArray is filled)") + - ((entry.scraper == null) ? "" : " (scraper is filled)")); + log.logInfo("CACHE FILL: " + entry.cacheFile + ((entry.cacheArray == null) ? "" : " (cacheArray is filled)")); break; case plasmaHTCache.CACHE_HIT: log.logInfo("CACHE HIT: " + entry.cacheFile); break; @@ -574,27 +566,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser profiles.close(); parser.close(); cacheManager.close(); + sbQueue.close(); } catch (IOException e) {} log.logSystem("SWITCHBOARD SHUTDOWN TERMINATED"); } - - /* - public int totalSize() { - return processStack.size() + cacheLoader.size() + noticeURL.stackSize(); - } - */ public int queueSize() { return sbQueue.size(); //return processStack.size() + cacheLoader.size() + noticeURL.stackSize(); } - /* - public int lUrlSize() { - return urlPool.loadedURL.size(); - } - */ - public int cacheSizeMin() { return wordIndex.size(); } @@ -812,13 +793,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // alternatively do a local crawl - if (sbQueue.size() >= crawlSlots) { - log.logDebug("LimitCrawl: too many processes in queue, dismissed (" + + if (sbQueue.size() >= indexingSlots) { + log.logDebug("LimitCrawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sbQueue.size() + ")"); return false; } if (cacheLoader.size() >= crawlSlots) { - log.logDebug("LimitCrawl: too many loader in queue, dismissed (" + + log.logDebug("LimitCrawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + cacheLoader.size() + ")"); return false; } @@ -924,7 +905,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // parse content plasmaParserDocument document = null; - if ((plasmaParser.supportedFileExt(entry.url())) || ((entry.responseHeader() != null) && (plasmaParser.supportedMimeTypesContains(entry.responseHeader().mime())))) { @@ -944,8 +924,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return; } - Date loadDate = entry.responseHeader().lastModified(); - if (loadDate == null) loadDate = entry.responseHeader().date(); + Date loadDate = null; + if (entry.responseHeader() != null) { + loadDate = entry.responseHeader().lastModified(); + if (loadDate == null) loadDate = entry.responseHeader().date(); + } if (loadDate == null) loadDate = new Date(); // put anchors on crawl stack @@ -1055,7 +1038,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // explicit delete/free resources + if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) cacheManager.deleteFile(entry.url()); document = null; entry = null; + + } catch (IOException e) { log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString()); } @@ -1166,7 +1152,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logInfo(stats + ": urlEntry=null"); return; } - cacheLoader.loadParallel(urlEntry.url(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile); + cacheLoader.loadParallel(urlEntry.url(), urlEntry.name(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile); log.logInfo(stats + ": enqueued for load " + urlEntry.url()); return; } diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index e0987d93a..5b8df40ed 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -76,8 +76,6 @@ public class plasmaSwitchboardQueue { yacySeedDB.commonHashLength, plasmaURL.urlCrawlDepthLength, plasmaURL.urlCrawlProfileHandleLength, - plasmaURL.urlForkFactorLength, - plasmaURL.urlForkFactorLength, plasmaURL.urlDescrLength }); @@ -96,8 +94,6 @@ public class plasmaSwitchboardQueue { (entry.initiator == null) ? plasmaURL.dummyHash.getBytes() : entry.initiator.getBytes(), serverCodings.enhancedCoder.encodeBase64Long((long) entry.depth, plasmaURL.urlCrawlDepthLength).getBytes(), (entry.profileHandle == null) ? plasmaURL.dummyHash.getBytes() : entry.profileHandle.getBytes(), - serverCodings.enhancedCoder.encodeBase64Long((long) entry.hrefCount, plasmaURL.urlForkFactorLength).getBytes(), - serverCodings.enhancedCoder.encodeBase64Long((long) entry.imageCount, plasmaURL.urlForkFactorLength).getBytes(), (entry.anchorName == null) ? "-".getBytes() : entry.anchorName.getBytes() }); } @@ -126,9 +122,8 @@ public class plasmaSwitchboardQueue { } public Entry newEntry(URL url, String referrer, Date ifModifiedSince, boolean requestWithCookie, - String initiator, int depth, String profilehandle, - int hrefCount, int imageCount, String anchorName) { - return new Entry(url, referrer, ifModifiedSince, requestWithCookie, initiator, depth, profilehandle, hrefCount, imageCount, anchorName); + String initiator, int depth, String profilehandle, String anchorName) { + return new Entry(url, referrer, ifModifiedSince, requestWithCookie, initiator, depth, profilehandle, anchorName); } public class Entry { @@ -139,8 +134,6 @@ public class plasmaSwitchboardQueue { private String initiator; // yacySeedDB.commonHashLength private int depth; // plasmaURL.urlCrawlDepthLength private String profileHandle; // plasmaURL.urlCrawlProfileHandleLength - private int hrefCount; // plasmaURL.urlForkFactorLength - private int imageCount; // plasmaURL.urlForkFactorLength private String anchorName; // plasmaURL.urlDescrLength // computed values @@ -149,8 +142,7 @@ public class plasmaSwitchboardQueue { private URL referrerURL; public Entry(URL url, String referrer, Date ifModifiedSince, boolean requestWithCookie, - String initiator, int depth, String profileHandle, - int hrefCount, int imageCount, String anchorName) { + String initiator, int depth, String profileHandle, String anchorName) { this.url = url; this.referrerHash = referrer; this.ifModifiedSince = ifModifiedSince; @@ -158,8 +150,6 @@ public class plasmaSwitchboardQueue { this.initiator = initiator; this.depth = depth; this.profileHandle = profileHandle; - this.hrefCount = hrefCount; - this.imageCount = imageCount; this.anchorName = anchorName; this.profileEntry = null; @@ -181,9 +171,7 @@ public class plasmaSwitchboardQueue { this.initiator = new String(row[4]); this.depth = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(row[5])); this.profileHandle = new String(row[6]); - this.hrefCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(row[7])); - this.imageCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(row[8])); - this.anchorName = new String(row[9]); + this.anchorName = new String(row[7]); this.profileEntry = null; this.responseHeader = null; @@ -248,14 +236,6 @@ public class plasmaSwitchboardQueue { } return referrerURL; } - - public int forkFactor() { - return hrefCount; - } - - public int images() { - return imageCount; - } public String anchorName() { return anchorName;