fixed post-parsing (the case where the bluelist is empty)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@41 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 20 years ago
parent 044b93412a
commit 00f223cfc1

@@ -15,10 +15,11 @@ document.getElementById("value").value=element.value;
<body marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
#[header]#
<br><br>
<h2>advanced Config</h2>
<h2>Advanced Config</h2>
<p>
Here are all Config Options from YaCy.<br>
You can change anything, but some Options need a restart, and some Options can crash YaCy when wrong values are used.
For an explanation please look into yacy.init
</p>
<select name="options" size="25" style="width: 600">
#{options}#<option id="#[key]#" value="#[value]#" onclick="element_clicked(this)">#[key]#: #[value]#</option>

@@ -45,7 +45,7 @@
# Contributions and changes to the program code must be marked as such.
# define variables
version='0.363'
version='0.365'
datestr=`date +%Y%m%d`
#release='yacy_v'$version'_'$datestr
release='yacy_dev_v'$version'_'$datestr

@@ -468,7 +468,7 @@ public final class httpc {
// and change the Content-Encoding and Content-Length attributes in the header
byte[] buffer = new byte[2048];
int l;
long len = 0;
long len = 0;
// find out length
long length = responseHeader.contentLength();

@@ -348,16 +348,16 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// attach possible yacy-sublevel-domain
if ((yAddress != null) &&
((pos = yAddress.indexOf("/")) >= 0) &&
(!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level
) remotePath = yAddress.substring(pos) + remotePath;
((pos = yAddress.indexOf("/")) >= 0) &&
(!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level
) remotePath = yAddress.substring(pos) + remotePath;
// decide whether to use a cache entry or connect to the network
File cacheFile = cacheManager.getCachePath(url);
String urlHash = plasmaCrawlLURL.urlHash(url);
httpHeader cachedResponseHeader = null;
boolean cacheExists = ((cacheFile.exists()) && (cacheFile.isFile()) &&
((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null));
((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null));
// why are files unzipped upon arrival? why not zip all files in cache?
// This follows from the following premises
@@ -381,9 +381,9 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// 4. cache stale - refill - superfluous
// in two of these cases we trigger a scheduler to handle newly arrived files:
// case 1 and case 3
plasmaHTCache.Entry hpc;
plasmaHTCache.Entry cacheEntry;
if ((cacheExists) &&
((hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK",
((cacheEntry = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK",
cachedResponseHeader, null,
switchboard.defaultProxyProfile)).shallUseCache())) {
// we respond on the request by using the cache, the cache is fresh
@@ -466,29 +466,34 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
long contentLength = res.responseHeader.contentLength();
// reserve cache entry
hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
cacheEntry = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
// handle file types
if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
(httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
// this is a file that is a possible candidate for parsing by the indexer
if (transformer.isIdentityTransformer()) {
log.logDebug("create passthrough (parse candidate) for url " + url);
// no transformation, only passthrough
// this is especially the case if the bluelist is empty
// in that case, the content is not scraped here but later
hfos = respond;
} else {
// make a scraper and transformer
log.logDebug("create scraper for url " + url);
scraper = new htmlFilterContentScraper(url);
hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
if (((htmlFilterOutputStream) hfos).binarySuspect()) {
scraper = null; // forget it, may be rubbish
log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
}
hpc.scraper = scraper;
cacheEntry.scraper = scraper;
}
} else {
log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
scraper = null;
hfos = respond;
hpc.scraper = scraper;
cacheEntry.scraper = scraper;
}
// handle incoming cookies
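
Note: the heart of this change is the branch above. When the transformer is an identity transformer (which is the case when the bluelist is empty), the proxy no longer builds a scraper and simply passes the response through; the content is parsed later from the cache. A minimal, self-contained sketch of that decision, using hypothetical stand-in types rather than the real YaCy classes:

import java.io.FilterOutputStream;
import java.io.OutputStream;

// Hypothetical stand-ins; only the branching mirrors httpdProxyHandler above.
interface Transformer {
    boolean isIdentityTransformer(); // true when the bluelist is empty
}

class ScrapingOutputStream extends FilterOutputStream {
    ScrapingOutputStream(OutputStream out) { super(out); }
    // in YaCy this would also feed an htmlFilterContentScraper while streaming
}

class ProxyOutputSketch {
    static OutputStream wireOutput(OutputStream respond, Transformer transformer) {
        if (transformer.isIdentityTransformer()) {
            // no transformation, only passthrough; the content is scraped later,
            // from cacheEntry.cacheArray or from the cache file
            return respond;
        }
        // bluelist is non-empty: scrape and transform while streaming to the client
        return new ScrapingOutputStream(respond);
    }
}
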
@@ -498,47 +503,52 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
try {
respondHeader(respond, res.status, res.responseHeader);
String storeError;
if ((storeError = hpc.shallStoreCache()) == null) {
if ((storeError = cacheEntry.shallStoreCache()) == null) {
// we write a new cache entry
if ((contentLength > 0) && // known
(contentLength < 1048576)) {// 1 MB
// ok, we don't actually write into a file, only to RAM, and schedule writing the file.
byte[] cacheArray;
cacheArray = res.writeContent(hfos);
byte[] cacheArray = res.writeContent(hfos);
log.logDebug("writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length)));
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
if (sizeBeforeDelete == -1) {
// totally fresh file
hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(hpc, cacheArray);
cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(cacheEntry, cacheArray);
} else if (sizeBeforeDelete == cacheArray.length) {
// before we came here we deleted a cache entry
cacheArray = null;
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(hpc); // unnecessary update
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(cacheEntry); // unnecessary update
} else {
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(cacheEntry, cacheArray); // necessary update, write response header to cache
}
} else {
// the file is too big to cache it in the ram, write to file right here
// the file is too big to cache it in the ram, or the size is unknown
// write to file right here.
cacheFile.getParentFile().mkdirs();
res.writeContent(hfos, cacheFile);
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
log.logDebug("for write-file of " + url + ": contentLength = " + contentLength + ", sizeBeforeDelete = " + sizeBeforeDelete);
if (sizeBeforeDelete == -1) {
// totally fresh file
hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(hpc);
cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(cacheEntry);
} else if (sizeBeforeDelete == cacheFile.length()) {
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(hpc); // unnecessary update
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(cacheEntry); // unnecessary update
} else {
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(hpc); // necessary update, write response header to cache
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(cacheEntry); // necessary update, write response header to cache
}
// beware! all these writings will not fill the cacheEntry.cacheArray
// that means they are not available for the indexer (unless they were scraped before)
}
} else {
// no caching
@@ -547,12 +557,12 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
if (sizeBeforeDelete == -1) {
// no old file and no load. just data passing
hpc.status = plasmaHTCache.CACHE_PASSING;
cacheManager.stackProcess(hpc);
cacheEntry.status = plasmaHTCache.CACHE_PASSING;
cacheManager.stackProcess(cacheEntry);
} else {
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
cacheManager.stackProcess(hpc);
cacheEntry.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
cacheManager.stackProcess(cacheEntry);
}
}
} catch (SocketException e) {

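Note: for storing, the handler above keeps responses of known size below 1 MB in RAM (and schedules the file write) and streams everything else directly into the cache file. A compact sketch of just that size check, with the threshold taken from the diff and the actual I/O left out:

// Sketch only: in YaCy the content is written via res.writeContent(...) and the
// resulting entry is handed to cacheManager.stackProcess(...).
class CacheWriteSketch {
    static final long RAM_CACHE_LIMIT = 1048576L; // 1 MB, as in the diff above

    static boolean cacheInRam(long contentLength) {
        // known length and small enough: buffer in RAM, write the file later;
        // unknown (<= 0) or too large: write to the cache file right away
        return contentLength > 0 && contentLength < RAM_CACHE_LIMIT;
    }
}
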
@@ -251,7 +251,9 @@ public final class plasmaHTCache {
entry.cacheFile.delete();
}
entry.cacheFile.getParentFile().mkdirs();
log.logInfo("WRITE FILE (" + entry.cacheArray.length + " bytes) " + entry.cacheFile);
serverFileUtils.write(entry.cacheArray, entry.cacheFile);
log.logInfo("AFTER WRITE cacheArray = " + entry.cacheFile + ": " + ((entry.cacheArray == null) ? "empty" : "full"));
//entry.cacheArray = null;
} catch (FileNotFoundException e) {
// this is the case of a "(Not a directory)" error, which should be prohibited
@@ -280,7 +282,10 @@ public final class plasmaHTCache {
case CACHE_UNFILLED:
log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break;
case CACHE_FILL:
log.logInfo("CACHE FILL: " + entry.cacheFile); break;
log.logInfo("CACHE FILL: " + entry.cacheFile +
((entry.cacheArray == null) ? "" : " (cacheArray is filled)") +
((entry.scraper == null) ? "" : " (scraper is filled)"));
break;
case CACHE_HIT:
log.logInfo("CACHE HIT: " + entry.cacheFile); break;
case CACHE_STALE_NO_RELOAD:

@@ -42,10 +42,11 @@
package de.anomic.plasma;
import de.anomic.htmlFilter.*;
import java.io.*;
import java.net.*;
import java.util.*;
import de.anomic.server.*;
import de.anomic.htmlFilter.*;
public class plasmaParser {
@@ -70,6 +71,18 @@ public class plasmaParser {
return null;
}
}
public document parseSource(URL location, String mimeType, File sourceFile) {
// make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
try {
serverFileUtils.copy(sourceFile, hfos);
return transformScraper(location, mimeType, scraper);
} catch (IOException e) {
return null;
}
}
public document transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
try {

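Note: the new parseSource(URL, String, File) overload above streams a cached file through a filtering output stream into a scraper and then calls transformScraper. The same pattern in plain Java, with a byte-collecting stream standing in for htmlFilterContentScraper (names and types here are illustrative, not YaCy API):

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

class ParseFromFileSketch {
    // stand-in for htmlFilterContentScraper: just collects the raw bytes
    static byte[] readSource(File sourceFile) throws IOException {
        ByteArrayOutputStream scraper = new ByteArrayOutputStream();
        InputStream in = new FileInputStream(sourceFile);
        try {
            byte[] buffer = new byte[2048];
            int l;
            while ((l = in.read(buffer)) != -1) scraper.write(buffer, 0, l); // like serverFileUtils.copy
        } finally {
            in.close();
        }
        return scraper.toByteArray(); // in YaCy the scraper is turned into a document here
    }
}
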
@@ -469,10 +469,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
private synchronized void processResourceStack(plasmaHTCache.Entry entry) {
// work off one stack entry with a fresh resource (scraped web page)
String stats = "DEQUEUE: dequeueing one step (processStack=" + processStack.size() + ", localStackSize=" + noticeURL.localStackSize() + ", remoteStackSize=" + noticeURL.remoteStackSize() + ")";
if ((entry.cacheArray == null) && (entry.scraper == null)) {
log.logDebug(stats + " entry for " + entry.nomalizedURLString + " has no content -- skipped");
return;
}
try {
// we must distinguish the following cases: resource-load was initiated by
@@ -504,10 +501,22 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
if (entry.scraper != null) {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is pre-parsed by scraper");
document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
} else {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed, parsing now");
} else if (entry.cacheArray != null) {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed yet, parsing now from cacheArray");
document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheArray);
}
} else {
if (entry.cacheFile.exists()) {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed yet, parsing now from File");
document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheFile);
} else {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' cannot be parsed, no resource available");
return;
}
}
if (document == null) {
log.logError("(Parser) '" + entry.nomalizedURLString + "' parse failure");
return;
}
// put anchors on crawl stack
if (((processCase == 4) || (processCase == 5)) &&

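Note: processResourceStack now tries three sources in order: a scraper that was filled while proxying, the in-RAM cacheArray, and finally the cache file on disk; if none is available the entry is skipped. A condensed, self-contained sketch of that dispatch with a hypothetical entry type:

import java.io.File;

// Hypothetical entry; the three fields mirror how plasmaHTCache.Entry is used above.
class ResourceEntrySketch {
    Object scraper;      // pre-parsed while proxying (bluelist non-empty)
    byte[] cacheArray;   // raw content buffered in RAM
    File cacheFile;      // content already written to the HTCACHE

    String chooseParseSource() {
        if (scraper != null) return "pre-parsed by scraper";
        if (cacheArray != null) return "parse now from cacheArray";
        if (cacheFile != null && cacheFile.exists()) return "parse now from cacheFile";
        return "cannot be parsed, no resource available"; // skip this entry
    }
}
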
@@ -1 +0,0 @@
testblue

@@ -92,7 +92,7 @@ parseableMime=application/xhtml+xml,text/html,text/plain
# a comma-separated list of extensions that denote media file formats
# this is important to recognize <a href> - tags as not-html reference
# These files will be excluded from indexing
mediaExt=swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar,sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj,jar,deb,torrent,ogg,iso,bin,ace,tgz,rpm
mediaExt=swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar,sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj,jar,deb,torrent,ogg,iso,bin,ace,tgz,rpm,css
# the proxy's and indexing maximum ram cache size in megabytes
ramCacheSize = 12
