fewer byte-arrays of response content, less byte-array <-> stream conversion

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7856 6c8d7289-2bf4-0310-a012-ef5d649a1542
sixcooler 13 years ago
parent 59b767eebd
commit ce248cc8dd

@@ -26,7 +26,6 @@
 package de.anomic.crawler.retrieval;
-import java.io.ByteArrayInputStream;
 import java.util.Date;
 import net.yacy.cora.date.GenericFormatter;
@@ -63,7 +62,7 @@ public class Response {
     private final RequestHeader requestHeader;
     private final ResponseHeader responseHeader;
     private final String responseStatus;
-    private final CrawlProfile profile;
+    private final CrawlProfile profile;
     private byte[] content;
     private int status; // tracker indexing status, see status defs below
@@ -824,7 +823,7 @@ public class Response {
         String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
         if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
         try {
-            return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content.length, new ByteArrayInputStream(this.content));
+            return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content);
         } catch (Exception e) {
             return null;
         }

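What this hunk buys: Response already holds the body as an in-memory byte[], so wrapping it in a ByteArrayInputStream only forced TextParser to read the bytes back into a second array. A minimal standalone sketch of the round trip that the new byte[] overload avoids (illustrative code, not YaCy's):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;

    final class RoundTripDemo {
        // old path: the callee receives a stream and must buffer it into a byte[] again
        static byte[] viaStream(final byte[] content) throws IOException {
            final InputStream in = new ByteArrayInputStream(content);
            final ByteArrayOutputStream out = new ByteArrayOutputStream(content.length);
            final byte[] buffer = new byte[4096];
            int n;
            while ((n = in.read(buffer)) != -1) out.write(buffer, 0, n); // extra copy
            return out.toByteArray();                                    // and another
        }

        // new path: hand the existing array over unchanged, zero extra copies
        static byte[] direct(final byte[] content) {
            return content;
        }
    }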
@@ -166,69 +166,11 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
             return;
         }
-        /* ===========================================================================
-         * LOAD RESOURCE DATA
-         * =========================================================================== */
-        // if the snippet is not in the cache, we can try to get it from the htcache
-        final Response response;
-        try {
-            // first try to get the snippet from metadata
-            String loc;
-            final boolean noCacheUsage = url.isFile() || url.isSMB() || cacheStrategy == null;
-            if (containsAllHashes(loc = comp.dc_title(), queryhashes)) {
-                // try to create the snippet from information given in the url itself
-                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
-                return;
-            } else if (containsAllHashes(loc = comp.dc_creator(), queryhashes)) {
-                // try to create the snippet from information given in the creator metadata
-                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
-                return;
-            } else if (containsAllHashes(loc = comp.dc_subject(), queryhashes)) {
-                // try to create the snippet from information given in the subject metadata
-                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
-                return;
-            } else if (containsAllHashes(loc = comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
-                // try to create the snippet from information given in the url
-                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
-                return;
-            } else {
-                // try to load the resource from the cache
-                response = loader == null ? null : loader.load(loader.request(url, true, reindexing), noCacheUsage ? CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
-                if (response == null) {
-                    // in case that we did not get any result we can still return a success when we are not allowed to go online
-                    if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {
-                        init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
-                        return;
-                    }
-                    // if it is still not available, report an error
-                    init(url.hash(), null, ResultClass.ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
-                    return;
-                } else {
-                    // place entry on indexing queue
-                    Switchboard.getSwitchboard().toIndexer(response);
-                    source = ResultClass.SOURCE_WEB;
-                }
-            }
-        } catch (final Exception e) {
-            //Log.logException(e);
-            init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
-            return;
-        }
-        /* ===========================================================================
-         * PARSE RESOURCE
-         * =========================================================================== */
-        Document document = null;
-        try {
-            document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
-        } catch (final Parser.Failure e) {
-            init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
-            return;
-        }
+        Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source);
         if (document == null) {
-            init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
+            if (this.error == null) {
+                init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
+            }
             return;
         }
@@ -281,6 +223,77 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
         document.close();
         init(url.hash(), snippetLine, source, null);
     }
+
+    private Document loadDocument(
+            final LoaderDispatcher loader,
+            final URIMetadataRow.Components comp,
+            final HandleSet queryhashes,
+            final CacheStrategy cacheStrategy,
+            final DigestURI url,
+            final boolean reindexing,
+            ResultClass source) {
+        /* ===========================================================================
+         * LOAD RESOURCE DATA
+         * =========================================================================== */
+        // if the snippet is not in the cache, we can try to get it from the htcache
+        final Response response;
+        try {
+            // first try to get the snippet from metadata
+            String loc;
+            final boolean noCacheUsage = url.isFile() || url.isSMB() || cacheStrategy == null;
+            if (containsAllHashes(loc = comp.dc_title(), queryhashes)) {
+                // try to create the snippet from information given in the url itself
+                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
+                return null;
+            } else if (containsAllHashes(loc = comp.dc_creator(), queryhashes)) {
+                // try to create the snippet from information given in the creator metadata
+                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
+                return null;
+            } else if (containsAllHashes(loc = comp.dc_subject(), queryhashes)) {
+                // try to create the snippet from information given in the subject metadata
+                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
+                return null;
+            } else if (containsAllHashes(loc = comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
+                // try to create the snippet from information given in the url
+                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
+                return null;
+            } else {
+                // try to load the resource from the cache
+                response = loader == null ? null : loader.load(loader.request(url, true, reindexing), noCacheUsage ? CacheStrategy.NOCACHE : cacheStrategy, Integer.MAX_VALUE, true);
+                if (response == null) {
+                    // in case that we did not get any result we can still return a success when we are not allowed to go online
+                    if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {
+                        init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
+                        return null;
+                    }
+                    // if it is still not available, report an error
+                    init(url.hash(), null, ResultClass.ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
+                    return null;
+                } else {
+                    // place entry on indexing queue
+                    Switchboard.getSwitchboard().toIndexer(response);
+                    source = ResultClass.SOURCE_WEB;
+                }
+            }
+        } catch (final Exception e) {
+            //Log.logException(e);
+            init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
+            return null;
+        }
+        /* ===========================================================================
+         * PARSE RESOURCE
+         * =========================================================================== */
+        Document document = null;
+        try {
+            document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
+        } catch (final Parser.Failure e) {
+            init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
+            return null;
+        }
+        return document;
+    }
+
     private void init(final byte[] urlhash, final String line, final ResultClass errorCode, final String errortext) {
         this.urlhash = urlhash;

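The TextSnippet change is an extract-method refactor: the load-and-parse block removed above reappears almost verbatim in the private loadDocument() helper, with each return; becoming return null; (note the load size limit also changes from Long.MAX_VALUE to Integer.MAX_VALUE). The helper returns null both when it could already build the snippet from metadata and when loading or parsing failed; in the failure cases it records the error itself via init(...), which is why the caller only reports a generic parser failure when this.error is still unset. A generic sketch of that null-plus-error-field contract, with hypothetical names:

    // Illustration only, not YaCy code.
    abstract class NullReturnContract {
        protected String error;             // last failure text, null if none

        protected abstract Object load();   // returns null after recording an error,
                                            // or after handling the request itself

        public final void process() {
            final Object document = load();
            if (document == null) {
                if (this.error == null) {   // helper stayed silent: report generically
                    this.error = "parser error/failed";
                }
                return;
            }
            // ... build the snippet from the document ...
        }
    }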
@@ -169,11 +169,23 @@ public final class TextParser {
     public static Document[] parseSource(
             final MultiProtocolURI location,
-            final String mimeType,
+            String mimeType,
             final String charset,
             final byte[] content
         ) throws Parser.Failure {
-        return parseSource(location, mimeType, charset, content.length, new ByteArrayInputStream(content));
+        if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
+        mimeType = normalizeMimeType(mimeType);
+        List<Parser> idioms = null;
+        try {
+            idioms = parsers(location, mimeType);
+        } catch (final Parser.Failure e) {
+            final String errorMsg = "Parser Failure for extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "': " + e.getMessage();
+            log.logWarning(errorMsg);
+            throw new Parser.Failure(errorMsg, location);
+        }
+        assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true, false);
+        return parseSource(location, mimeType, idioms, charset, content);
     }
 
     public static Document[] parseSource(
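A hedged usage sketch of the reworked byte[] overload; the package paths follow YaCy's source layout of that period and may differ:

    import net.yacy.cora.document.MultiProtocolURI;
    import net.yacy.document.Document;
    import net.yacy.document.Parser;
    import net.yacy.document.TextParser;

    final class InMemoryParse {
        // body is a response that is already fully in memory; no stream wrapper needed
        static Document[] parse(final MultiProtocolURI location, final String mime,
                final String charset, final byte[] body) throws Parser.Failure {
            return TextParser.parseSource(location, mime, charset, body);
        }
    }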
@@ -199,9 +211,7 @@ public final class TextParser {
         // then we use only one stream-oriented parser.
         if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
             // use a specific stream-oriented parser
-            final Document[] docs = parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
-            for (final Document d: docs) { assert d.getText() != null; } // verify docs
-            return docs;
+            return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
         }
 
         // in case that we know more parsers we first transform the content into a byte[] and use that as base
@@ -212,9 +222,7 @@ public final class TextParser {
         } catch (final IOException e) {
             throw new Parser.Failure(e.getMessage(), location);
         }
-        final Document[] docs = parseSource(location, mimeType, idioms, charset, b);
-        for (final Document d: docs) { assert d.getText() != null; } // verify docs
-        return docs;
+        return parseSource(location, mimeType, idioms, charset, b);
     }
 
     private static Document[] parseSource(
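Both hunks above replace a local docs array plus per-document assert loop with a direct delegation. The surrounding dispatch stays the same: a single candidate parser (or content too large for an int-sized array) keeps the stream path, while several candidates require buffering the stream into one byte[] so every parser can re-read the same bytes; each retry wraps the array in a fresh ByteArrayInputStream, which shares the array rather than copying it. A simplified, hypothetical sketch of that dispatch:

    // Illustration only, not the YaCy implementation.
    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.List;

    final class ParserDispatch {
        interface Idiom { Object parse(InputStream source) throws IOException; }

        static Object parse(final List<Idiom> idioms, final InputStream source,
                final long contentLength) throws IOException {
            if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
                return idioms.get(0).parse(source);  // stream once, never buffered
            }
            final byte[] b = source.readAllBytes();  // buffer once (Java 9+)
            for (final Idiom idiom : idioms) {
                try {
                    return idiom.parse(new ByteArrayInputStream(b)); // fresh view, no copy
                } catch (final IOException e) {
                    // fall through and try the next candidate parser
                }
            }
            throw new IOException("no parser succeeded");
        }
    }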
@@ -254,7 +262,7 @@ public final class TextParser {
         Document[] docs = null;
         final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
-        if (MemoryControl.request(sourceArray.length * 2, false)) {
+        if (MemoryControl.request(sourceArray.length * 6, false)) {
             for (final Parser parser: parsers) {
                 try {
                     docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
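The final hunk raises the pre-flight memory request from two to six times the source size, presumably to leave headroom for several parsers each building their in-memory document structures over the same byte[]. YaCy's MemoryControl is more elaborate, but the idea reduces to a free-heap check along these lines (rough sketch, hypothetical class):

    final class MemoryGuard {
        // true if roughly the requested number of bytes can still be allocated
        static boolean request(final long bytes) {
            final Runtime rt = Runtime.getRuntime();
            final long used = rt.totalMemory() - rt.freeMemory();
            final long available = rt.maxMemory() - used;
            return available >= bytes;
        }
    }

    // usage mirroring the hunk: MemoryGuard.request(sourceArray.length * 6L)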
