fixed: starting a crawl resulted in "No parser available to parse mimetype 'application/octet-stream'"

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5047 6c8d7289-2bf4-0310-a012-ef5d649a1542
Author: danielr
Parent: 7e7e6a099a
Commit: a087090bbb

@@ -187,7 +187,8 @@ public final class HTTPLoader {
         if (plasmaParser.supportedContent(parserMode, entry.url(), res.getResponseHeader().mime())) {
             // delete old content
             if (cacheFile.isFile()) {
-                plasmaHTCache.deleteURLfromCache(entry.url());
+                // TODO why is content and metadata so separated? htcache holds metadata, but deleteURLfromCache deletes it???
+                plasmaHTCache.deleteURLfromCache(entry.url(), true);
             }
             // create parent directories
@@ -217,7 +218,7 @@ public final class HTTPLoader {
         }
         // we write the new cache entry to file system directly
-        (res).setAccountingName("CRAWLER");
+        res.setAccountingName("CRAWLER");
         final byte[] responseBody = res.getData();
         fos.write(responseBody);
         htCache.setCacheArray(responseBody);

@@ -59,8 +59,8 @@ public class ResourceInfo implements IResourceInfo {
     }
     public ResourceInfo(final yacyURL objectURL, final httpHeader requestHeaders, final httpHeader responseHeaders) {
-        if (objectURL == null) throw new NullPointerException();
-        if (responseHeaders == null) throw new NullPointerException();
+        if (objectURL == null) throw new NullPointerException("objectURL == null");
+        if (responseHeaders == null) throw new NullPointerException("responseHeader == null");
         // generating the url hash
         this.url = objectURL;
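The hunk above swaps bare NullPointerExceptions for ones that name the offending argument, so the stack trace says which precondition failed. A minimal standalone sketch of the same guard; on Java 7+ Objects.requireNonNull gives the message-carrying NPE in one call (YaCy at the time targeted older JDKs, so the explicit if-checks in the diff are the equivalent), and ResourceInfoGuard is a hypothetical class, not YaCy code:

import java.util.Objects;

// Hypothetical standalone class: shows the message-carrying null guard
// from the hunk above in one call per argument.
public class ResourceInfoGuard {
    private final Object url;

    public ResourceInfoGuard(final Object objectURL, final Object responseHeaders) {
        // throws NullPointerException("objectURL == null") when null is passed
        this.url = Objects.requireNonNull(objectURL, "objectURL == null");
        Objects.requireNonNull(responseHeaders, "responseHeaders == null");
    }
}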

@@ -254,6 +254,7 @@ public final class plasmaHTCache {
     }
     static void resetResponseHeaderDB() {
         log.logFine("reset responseHeader DB with "+ responseHeaderDB.size() +" entries");
+        if (responseHeaderDB != null) responseHeaderDB.close();
         final File dbfile = new File(cachePath, DB_NAME);
         if (dbfile.exists()) dbfile.delete();
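The added close() releases the database handle before the backing file is deleted. A sketch of the close-before-delete order, with java.io.Closeable standing in for the kelondro table behind responseHeaderDB (an assumption for illustration, not the actual type):

import java.io.Closeable;
import java.io.File;
import java.io.IOException;

// Sketch only: close the handle first, then delete the file. On Windows
// in particular, deleting a still-open file simply fails.
final class ResetSketch {
    static void reset(final Closeable db, final File dbfile) throws IOException {
        if (db != null) db.close();
        if (dbfile.exists() && !dbfile.delete()) {
            throw new IOException("could not delete " + dbfile);
        }
    }
}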
@@ -358,14 +359,18 @@ public final class plasmaHTCache {
     }
     public static boolean deleteURLfromCache(final yacyURL url) {
-        if (deleteFileandDirs(getCachePath(url), "FROM")) {
+        return deleteURLfromCache(url, false);
+    }
+    public static boolean deleteURLfromCache(final yacyURL url, final boolean keepHeader) {
+        if (deleteFileandDirs(getCachePath(url), "FROM") && !keepHeader) {
             try {
                 // As the file is gone, the entry in responseHeader.db is not needed anymore
+                if (log.isFinest()) log.logFinest("Trying to remove responseHeader from URL: " + url.toNormalform(false, true));
                 responseHeaderDB.remove(url.hash());
             } catch (final IOException e) {
                 resetResponseHeaderDB();
-                log.logInfo("IOExeption removing response header from DB: " + e.getMessage(), e);
+                log.logWarning("IOExeption removing response header from DB: " + e.getMessage(), e);
             }
             return true;
         }
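This hunk is the core of the fix: the one-argument deleteURLfromCache now delegates to a two-argument overload, so the HTTPLoader call site can keep the stored response header while replacing stale content. A minimal sketch of the delegating-overload pattern with hypothetical stand-in names (CacheDeleteSketch, removeFiles, removeHeader); the trailing return false is assumed from the original method's shape, which the hunk truncates:

// Hypothetical stand-ins; only the overload shape mirrors the hunk.
public final class CacheDeleteSketch {
    // old entry point keeps its behavior by delegating with keepHeader=false
    public static boolean delete(final String urlHash) {
        return delete(urlHash, false);
    }

    public static boolean delete(final String urlHash, final boolean keepHeader) {
        // drop the stored response header only when the caller did not ask to keep it
        if (removeFiles(urlHash) && !keepHeader) {
            removeHeader(urlHash);
            return true;
        }
        return false;
    }

    private static boolean removeFiles(final String urlHash) { return true; }
    private static void removeHeader(final String urlHash) { /* no-op in the sketch */ }
}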
@@ -904,7 +909,6 @@ public final class plasmaHTCache {
             initiator,
             profile
         );
-        entry.writeResourceInfo();
         return entry;
     }
@@ -936,7 +940,7 @@ public final class plasmaHTCache {
      */
     private final IResourceInfo resInfo;
-    protected Entry clone() throws CloneNotSupportedException {
+    protected Entry clone() {
         return new Entry(
             this.initDate,
             this.depth,
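Dropping throws CloneNotSupportedException here is legal: an override may narrow the throws clause, and Java 5 covariant returns let clone() be declared as Entry rather than Object. Both are exercised in this hedged sketch (EntrySketch is a hypothetical minimal class, not the plasmaHTCache Entry):

// The override narrows Object.clone()'s return type and omits the checked
// exception, copying via a constructor instead of calling super.clone().
class EntrySketch implements Cloneable {
    private final int depth;

    EntrySketch(final int depth) {
        this.depth = depth;
    }

    @Override
    protected EntrySketch clone() {
        return new EntrySketch(this.depth);
    }
}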
@@ -984,6 +988,8 @@ public final class plasmaHTCache {
         // to be defined later:
         this.cacheArray = null;
+        writeResourceInfo();
     }
     public String name() {
@@ -1046,7 +1052,7 @@ public final class plasmaHTCache {
         return this.resInfo;
     }
-    boolean writeResourceInfo() {
+    private boolean writeResourceInfo() {
         if (this.resInfo == null) return false;
         try {
             final HashMap<String, String> hm = new HashMap<String, String>();
@@ -1054,9 +1060,10 @@ public final class plasmaHTCache {
             hm.put("@@URL", this.url.toNormalform(false, false));
             hm.put("@@DEPTH", Integer.toString(this.depth));
             if (this.initiator != null) hm.put("@@INITIATOR", this.initiator);
-            getResponseHeaderDB().put(this.url.hash(), hm);
+            plasmaHTCache.getResponseHeaderDB().put(this.url.hash(), hm);
         } catch (final Exception e) {
-            resetResponseHeaderDB();
+            log.logWarning("could not write ResourceInfo: "+ e.getClass() +": "+ e.getMessage());
+            plasmaHTCache.resetResponseHeaderDB();
             return false;
         }
         return true;
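The reworked catch block now logs the failure before falling back to resetting the header store, so the cause is preserved. A sketch of that write-then-recover pattern, with a plain HashMap and clear() standing in for the kelondro-backed responseHeaderDB and its reset (assumptions for illustration only):

import java.util.HashMap;
import java.util.Map;

// Sketch only: headerDB and its clear() are stand-ins for the real store.
final class HeaderWriteSketch {
    private final Map<String, Map<String, String>> headerDB = new HashMap<String, Map<String, String>>();

    boolean writeResourceInfo(final String urlHash, final Map<String, String> headers) {
        try {
            headerDB.put(urlHash, headers);
        } catch (final Exception e) {
            // log first, so the cause survives, then reset the store and report failure
            System.err.println("could not write ResourceInfo: " + e.getClass() + ": " + e.getMessage());
            headerDB.clear();
            return false;
        }
        return true;
    }
}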

@@ -977,6 +977,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
      * Testing if the content type is supported by the available parsers
      * ========================================================================= */
     final boolean isSupportedContent = plasmaParser.supportedContent(entry.url(),entry.getMimeType());
+    log.logFinest(entry.url() +" content of type "+ entry.getMimeType() +" is supported: "+ isSupportedContent);
     /* =========================================================================
      * INDEX CONTROL HEADER
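Note the contrast with the plasmaHTCache hunk, which guards its finest-level message behind log.isFinest(): the unguarded call added here still pays for the string concatenation even when FINEST is disabled. A sketch of the guarded form using java.util.logging directly (an assumption; YaCy routes through its own serverLog wrapper, and the class and method names below are hypothetical):

import java.util.logging.Level;
import java.util.logging.Logger;

// Sketch only: isLoggable() skips the message construction entirely
// when FINEST is not enabled for this logger.
final class FinestLogSketch {
    private static final Logger log = Logger.getLogger("PLASMA");

    static void report(final String url, final String mime, final boolean supported) {
        if (log.isLoggable(Level.FINEST)) {
            log.finest(url + " content of type " + mime + " is supported: " + supported);
        }
    }
}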

@@ -732,7 +732,7 @@ public class yacyURL implements Serializable {
         final byte flagbyte = (byte) (((isHTTP) ? 0 : 32) | (id << 2) | domlengthKey);
         // combine the attributes
-        final StringBuffer hash = new StringBuffer(12);
+        final StringBuilder hash = new StringBuilder(12);
         // form the 'local' part of the hash
         hash.append(kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(toNormalform(true, true))).substring(0, 5)); // 5 chars
         hash.append(subdomPortPath(subdom, port, rootpath)); // 1 char
@@ -741,7 +741,7 @@ public class yacyURL implements Serializable {
         hash.append(kelondroBase64Order.enhancedCoder.encodeByte(flagbyte)); // 1 char
         // return result hash
-        return new String(hash);
+        return hash.toString();
     }
     private static char subdomPortPath(final String subdom, final int port, final String rootpath) {
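StringBuilder (Java 5) is the unsynchronized drop-in for StringBuffer, appropriate here because the builder never escapes the method, and hash.toString() is the idiomatic way to materialize the result. A reduced sketch of the rewritten hash assembly; HashBuildSketch, buildHash and its parameters are hypothetical, with only the per-append char counts taken from the diff's own comments:

// Sketch only: mirrors the StringBuilder usage of the two hunks above.
final class HashBuildSketch {
    static String buildHash(final String localPart, final char subdomPortPath, final String hostPart, final char flag) {
        final StringBuilder hash = new StringBuilder(12);
        hash.append(localPart);       // 5 chars derived from the normalized URL's MD5
        hash.append(subdomPortPath);  // 1 char
        hash.append(hostPart);        // the 'global' part of the hash
        hash.append(flag);            // 1 char flag byte
        return hash.toString();
    }
}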
