From a087090bbb9447a362d3a438893aee2b72262e70 Mon Sep 17 00:00:00 2001 From: danielr Date: Sun, 10 Aug 2008 11:31:40 +0000 Subject: [PATCH] fixed starting crawl results in "No parser available to parse mimetype 'application/octet-stream'" git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5047 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/HTTPLoader.java | 5 +++-- .../plasma/cache/http/ResourceInfo.java | 4 ++-- source/de/anomic/plasma/plasmaHTCache.java | 21 ++++++++++++------- .../de/anomic/plasma/plasmaSwitchboard.java | 1 + source/de/anomic/yacy/yacyURL.java | 4 ++-- 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index 603345ae3..7c1a28b98 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -187,7 +187,8 @@ public final class HTTPLoader { if (plasmaParser.supportedContent(parserMode, entry.url(), res.getResponseHeader().mime())) { // delete old content if (cacheFile.isFile()) { - plasmaHTCache.deleteURLfromCache(entry.url()); + // TODO why is content and metadata so separated? htcache holds metadata, but deleteURLfromCache deletes it??? + plasmaHTCache.deleteURLfromCache(entry.url(), true); } // create parent directories @@ -217,7 +218,7 @@ public final class HTTPLoader { } // we write the new cache entry to file system directly - (res).setAccountingName("CRAWLER"); + res.setAccountingName("CRAWLER"); final byte[] responseBody = res.getData(); fos.write(responseBody); htCache.setCacheArray(responseBody); diff --git a/source/de/anomic/plasma/cache/http/ResourceInfo.java b/source/de/anomic/plasma/cache/http/ResourceInfo.java index f3d9d5069..c8bc27b1e 100644 --- a/source/de/anomic/plasma/cache/http/ResourceInfo.java +++ b/source/de/anomic/plasma/cache/http/ResourceInfo.java @@ -59,8 +59,8 @@ public class ResourceInfo implements IResourceInfo { } public ResourceInfo(final yacyURL objectURL, final httpHeader requestHeaders, final httpHeader responseHeaders) { - if (objectURL == null) throw new NullPointerException(); - if (responseHeaders == null) throw new NullPointerException(); + if (objectURL == null) throw new NullPointerException("objectURL == null"); + if (responseHeaders == null) throw new NullPointerException("responseHeader == null"); // generating the url hash this.url = objectURL; diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index af8844662..0fa85d47f 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -254,6 +254,7 @@ public final class plasmaHTCache { } static void resetResponseHeaderDB() { + log.logFine("reset responseHeader DB with "+ responseHeaderDB.size() +" entries"); if (responseHeaderDB != null) responseHeaderDB.close(); final File dbfile = new File(cachePath, DB_NAME); if (dbfile.exists()) dbfile.delete(); @@ -358,14 +359,18 @@ public final class plasmaHTCache { } public static boolean deleteURLfromCache(final yacyURL url) { - if (deleteFileandDirs(getCachePath(url), "FROM")) { + return deleteURLfromCache(url, false); + } + + public static boolean deleteURLfromCache(final yacyURL url, final boolean keepHeader) { + if (deleteFileandDirs(getCachePath(url), "FROM") && !keepHeader) { try { // As the file is gone, the entry in responseHeader.db is not needed anymore if (log.isFinest()) log.logFinest("Trying to remove responseHeader from URL: " + url.toNormalform(false, true)); responseHeaderDB.remove(url.hash()); } catch (final IOException e) { resetResponseHeaderDB(); - log.logInfo("IOExeption removing response header from DB: " + e.getMessage(), e); + log.logWarning("IOExeption removing response header from DB: " + e.getMessage(), e); } return true; } @@ -904,7 +909,6 @@ public final class plasmaHTCache { initiator, profile ); - entry.writeResourceInfo(); return entry; } @@ -936,7 +940,7 @@ public final class plasmaHTCache { */ private final IResourceInfo resInfo; - protected Entry clone() throws CloneNotSupportedException { + protected Entry clone() { return new Entry( this.initDate, this.depth, @@ -984,6 +988,8 @@ public final class plasmaHTCache { // to be defined later: this.cacheArray = null; + + writeResourceInfo(); } public String name() { @@ -1046,7 +1052,7 @@ public final class plasmaHTCache { return this.resInfo; } - boolean writeResourceInfo() { + private boolean writeResourceInfo() { if (this.resInfo == null) return false; try { final HashMap hm = new HashMap(); @@ -1054,9 +1060,10 @@ public final class plasmaHTCache { hm.put("@@URL", this.url.toNormalform(false, false)); hm.put("@@DEPTH", Integer.toString(this.depth)); if (this.initiator != null) hm.put("@@INITIATOR", this.initiator); - getResponseHeaderDB().put(this.url.hash(), hm); + plasmaHTCache.getResponseHeaderDB().put(this.url.hash(), hm); } catch (final Exception e) { - resetResponseHeaderDB(); + log.logWarning("could not write ResourceInfo: "+ e.getClass() +": "+ e.getMessage()); + plasmaHTCache.resetResponseHeaderDB(); return false; } return true; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 1d64cde61..452b12bef 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -977,6 +977,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch