fixed: starting a crawl resulted in "No parser available to parse mimetype 'application/octet-stream'"

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5047 6c8d7289-2bf4-0310-a012-ef5d649a1542
Author: danielr
Parent: 7e7e6a099a
Commit: a087090bbb

@@ -187,7 +187,8 @@ public final class HTTPLoader {
         if (plasmaParser.supportedContent(parserMode, entry.url(), res.getResponseHeader().mime())) {
             // delete old content
             if (cacheFile.isFile()) {
-                plasmaHTCache.deleteURLfromCache(entry.url());
+                // TODO why is content and metadata so separated? htcache holds metadata, but deleteURLfromCache deletes it???
+                plasmaHTCache.deleteURLfromCache(entry.url(), true);
             }
             // create parent directories
@@ -217,7 +218,7 @@ public final class HTTPLoader {
         }
         // we write the new cache entry to file system directly
-        (res).setAccountingName("CRAWLER");
+        res.setAccountingName("CRAWLER");
         final byte[] responseBody = res.getData();
         fos.write(responseBody);
         htCache.setCacheArray(responseBody);

@@ -59,8 +59,8 @@ public class ResourceInfo implements IResourceInfo {
     }
     public ResourceInfo(final yacyURL objectURL, final httpHeader requestHeaders, final httpHeader responseHeaders) {
-        if (objectURL == null) throw new NullPointerException();
-        if (responseHeaders == null) throw new NullPointerException();
+        if (objectURL == null) throw new NullPointerException("objectURL == null");
+        if (responseHeaders == null) throw new NullPointerException("responseHeader == null");
         // generating the url hash
         this.url = objectURL;
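The hunk above swaps bare NullPointerExceptions for ones that name the offending argument, so the stack trace says which precondition failed. A minimal standalone sketch of the same guard; on Java 7+ Objects.requireNonNull gives the message-carrying NPE in one call (YaCy at the time targeted older JDKs, so the explicit if-checks in the diff are the equivalent), and ResourceInfoGuard is a hypothetical class, not YaCy code:

import java.util.Objects;

// Hypothetical standalone class: shows the message-carrying null guard
// from the hunk above in one call per argument.
public class ResourceInfoGuard {
    private final Object url;

    public ResourceInfoGuard(final Object objectURL, final Object responseHeaders) {
        // throws NullPointerException("objectURL == null") when null is passed
        this.url = Objects.requireNonNull(objectURL, "objectURL == null");
        Objects.requireNonNull(responseHeaders, "responseHeaders == null");
    }
}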

@@ -254,6 +254,7 @@ public final class plasmaHTCache {
     }
     static void resetResponseHeaderDB() {
         log.logFine("reset responseHeader DB with "+ responseHeaderDB.size() +" entries");
+        if (responseHeaderDB != null) responseHeaderDB.close();
         final File dbfile = new File(cachePath, DB_NAME);
         if (dbfile.exists()) dbfile.delete();
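The added close() releases the database handle before the backing file is deleted. A sketch of the close-before-delete order, with java.io.Closeable standing in for the kelondro table behind responseHeaderDB (an assumption for illustration, not the actual type):

import java.io.Closeable;
import java.io.File;
import java.io.IOException;

// Sketch only: close the handle first, then delete the file. On Windows
// in particular, deleting a still-open file simply fails.
final class ResetSketch {
    static void reset(final Closeable db, final File dbfile) throws IOException {
        if (db != null) db.close();
        if (dbfile.exists() && !dbfile.delete()) {
            throw new IOException("could not delete " + dbfile);
        }
    }
}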
@@ -358,14 +359,18 @@ public final class plasmaHTCache {
     }
     public static boolean deleteURLfromCache(final yacyURL url) {
-        if (deleteFileandDirs(getCachePath(url), "FROM")) {
+        return deleteURLfromCache(url, false);
+    }
+    public static boolean deleteURLfromCache(final yacyURL url, final boolean keepHeader) {
+        if (deleteFileandDirs(getCachePath(url), "FROM") && !keepHeader) {
             try {
                 // As the file is gone, the entry in responseHeader.db is not needed anymore
+                if (log.isFinest()) log.logFinest("Trying to remove responseHeader from URL: " + url.toNormalform(false, true));
                 responseHeaderDB.remove(url.hash());
             } catch (final IOException e) {
                 resetResponseHeaderDB();
-                log.logInfo("IOExeption removing response header from DB: " + e.getMessage(), e);
+                log.logWarning("IOExeption removing response header from DB: " + e.getMessage(), e);
             }
             return true;
         }
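This hunk is the core of the fix: the one-argument deleteURLfromCache now delegates to a two-argument overload, so the HTTPLoader call site can keep the stored response header while replacing stale content. A minimal sketch of the delegating-overload pattern with hypothetical stand-in names (CacheDeleteSketch, removeFiles, removeHeader); the trailing return false is assumed from the original method's shape, which the hunk truncates:

// Hypothetical stand-ins; only the overload shape mirrors the hunk.
public final class CacheDeleteSketch {
    // old entry point keeps its behavior by delegating with keepHeader=false
    public static boolean delete(final String urlHash) {
        return delete(urlHash, false);
    }

    public static boolean delete(final String urlHash, final boolean keepHeader) {
        // drop the stored response header only when the caller did not ask to keep it
        if (removeFiles(urlHash) && !keepHeader) {
            removeHeader(urlHash);
            return true;
        }
        return false;
    }

    private static boolean removeFiles(final String urlHash) { return true; }
    private static void removeHeader(final String urlHash) { /* no-op in the sketch */ }
}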
@@ -904,7 +909,6 @@ public final class plasmaHTCache {
             initiator,
             profile
         );
-        entry.writeResourceInfo();
         return entry;
     }
@@ -936,7 +940,7 @@ public final class plasmaHTCache {
      */
     private final IResourceInfo resInfo;
-    protected Entry clone() throws CloneNotSupportedException {
+    protected Entry clone() {
         return new Entry(
             this.initDate,
             this.depth,
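Dropping throws CloneNotSupportedException here is legal: an override may narrow the throws clause, and Java 5 covariant returns let clone() be declared as Entry rather than Object. Both are exercised in this hedged sketch (EntrySketch is a hypothetical minimal class, not the plasmaHTCache Entry):

// The override narrows Object.clone()'s return type and omits the checked
// exception, copying via a constructor instead of calling super.clone().
class EntrySketch implements Cloneable {
    private final int depth;

    EntrySketch(final int depth) {
        this.depth = depth;
    }

    @Override
    protected EntrySketch clone() {
        return new EntrySketch(this.depth);
    }
}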
@@ -984,6 +988,8 @@ public final class plasmaHTCache {
         // to be defined later:
         this.cacheArray = null;
+        writeResourceInfo();
     }
     public String name() {
@@ -1046,7 +1052,7 @@ public final class plasmaHTCache {
         return this.resInfo;
     }
-    boolean writeResourceInfo() {
+    private boolean writeResourceInfo() {
         if (this.resInfo == null) return false;
         try {
             final HashMap<String, String> hm = new HashMap<String, String>();
@@ -1054,9 +1060,10 @@ public final class plasmaHTCache {
             hm.put("@@URL", this.url.toNormalform(false, false));
             hm.put("@@DEPTH", Integer.toString(this.depth));
             if (this.initiator != null) hm.put("@@INITIATOR", this.initiator);
-            getResponseHeaderDB().put(this.url.hash(), hm);
+            plasmaHTCache.getResponseHeaderDB().put(this.url.hash(), hm);
         } catch (final Exception e) {
-            resetResponseHeaderDB();
+            log.logWarning("could not write ResourceInfo: "+ e.getClass() +": "+ e.getMessage());
+            plasmaHTCache.resetResponseHeaderDB();
             return false;
         }
         return true;
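The reworked catch block now logs the failure before falling back to resetting the header store, so the cause is preserved. A sketch of that write-then-recover pattern, with a plain HashMap and clear() standing in for the kelondro-backed responseHeaderDB and its reset (assumptions for illustration only):

import java.util.HashMap;
import java.util.Map;

// Sketch only: headerDB and its clear() are stand-ins for the real store.
final class HeaderWriteSketch {
    private final Map<String, Map<String, String>> headerDB = new HashMap<String, Map<String, String>>();

    boolean writeResourceInfo(final String urlHash, final Map<String, String> headers) {
        try {
            headerDB.put(urlHash, headers);
        } catch (final Exception e) {
            // log first, so the cause survives, then reset the store and report failure
            System.err.println("could not write ResourceInfo: " + e.getClass() + ": " + e.getMessage());
            headerDB.clear();
            return false;
        }
        return true;
    }
}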

@@ -977,6 +977,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
      * Testing if the content type is supported by the available parsers
      * ========================================================================= */
     final boolean isSupportedContent = plasmaParser.supportedContent(entry.url(),entry.getMimeType());
+    log.logFinest(entry.url() +" content of type "+ entry.getMimeType() +" is supported: "+ isSupportedContent);
     /* =========================================================================
      * INDEX CONTROL HEADER
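Note the contrast with the plasmaHTCache hunk, which guards its finest-level message behind log.isFinest(): the unguarded call added here still pays for the string concatenation even when FINEST is disabled. A sketch of the guarded form using java.util.logging directly (an assumption; YaCy routes through its own serverLog wrapper, and the class and method names below are hypothetical):

import java.util.logging.Level;
import java.util.logging.Logger;

// Sketch only: isLoggable() skips the message construction entirely
// when FINEST is not enabled for this logger.
final class FinestLogSketch {
    private static final Logger log = Logger.getLogger("PLASMA");

    static void report(final String url, final String mime, final boolean supported) {
        if (log.isLoggable(Level.FINEST)) {
            log.finest(url + " content of type " + mime + " is supported: " + supported);
        }
    }
}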

@@ -732,7 +732,7 @@ public class yacyURL implements Serializable {
         final byte flagbyte = (byte) (((isHTTP) ? 0 : 32) | (id << 2) | domlengthKey);
         // combine the attributes
-        final StringBuffer hash = new StringBuffer(12);
+        final StringBuilder hash = new StringBuilder(12);
         // form the 'local' part of the hash
         hash.append(kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(toNormalform(true, true))).substring(0, 5)); // 5 chars
         hash.append(subdomPortPath(subdom, port, rootpath)); // 1 char
@@ -741,7 +741,7 @@ public class yacyURL implements Serializable {
         hash.append(kelondroBase64Order.enhancedCoder.encodeByte(flagbyte)); // 1 char
         // return result hash
-        return new String(hash);
+        return hash.toString();
     }
     private static char subdomPortPath(final String subdom, final int port, final String rootpath) {
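StringBuilder (Java 5) is the unsynchronized drop-in for StringBuffer, appropriate here because the builder never escapes the method, and hash.toString() is the idiomatic way to materialize the result. A reduced sketch of the rewritten hash assembly; HashBuildSketch, buildHash and its parameters are hypothetical, with only the per-append char counts taken from the diff's own comments:

// Sketch only: mirrors the StringBuilder usage of the two hunks above.
final class HashBuildSketch {
    static String buildHash(final String localPart, final char subdomPortPath, final String hostPart, final char flag) {
        final StringBuilder hash = new StringBuilder(12);
        hash.append(localPart);       // 5 chars derived from the normalized URL's MD5
        hash.append(subdomPortPath);  // 1 char
        hash.append(hostPart);        // the 'global' part of the hash
        hash.append(flag);            // 1 char flag byte
        return hash.toString();
    }
}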
