diff --git a/source/de/anomic/crawler/CrawlEntry.java b/source/de/anomic/crawler/CrawlEntry.java index 0bbab332b..07afa0e13 100755 --- a/source/de/anomic/crawler/CrawlEntry.java +++ b/source/de/anomic/crawler/CrawlEntry.java @@ -104,7 +104,7 @@ public class CrawlEntry { assert appdate != null; assert url != null; assert initiator != null; - assert initiator.length() > 0; + assert initiator.length() > 0 : "initiator of '"+ url +"' is empty"; assert referrerhash != null; this.initiator = initiator; this.url = url; diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index b8903aa27..f0fb5dbed 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -220,6 +220,9 @@ public final class CrawlStacker extends Thread { final CrawlProfile.entry profile) { if (profile == null) return; + // DEBUG + log.logFinest("ENQUEUE "+ nexturl +", referer="+referrerhash +", initiator="+initiatorHash +", name="+name +", load="+loadDate +", depth="+currentdepth); + // check first before we create a big object if (this.urlEntryCache.has(nexturl.hash().getBytes())) return; diff --git a/source/de/anomic/crawler/IndexingStack.java b/source/de/anomic/crawler/IndexingStack.java index bf8b93b59..8f87f47c0 100644 --- a/source/de/anomic/crawler/IndexingStack.java +++ b/source/de/anomic/crawler/IndexingStack.java @@ -625,5 +625,14 @@ public class IndexingStack { return null; } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + public String toString() { + return "QueueEntry of "+ url.toString() + ", ref="+referrerHash +", initiator="+initiator +", flags="+ flags +", anchor="+ anchorName; + } } // class Entry } \ No newline at end of file diff --git a/source/de/anomic/http/MultiOutputStream.java b/source/de/anomic/http/MultiOutputStream.java new file mode 100644 index 000000000..893b4eb89 --- /dev/null +++ b/source/de/anomic/http/MultiOutputStream.java @@ -0,0 +1,44 @@ +/** + * 
MultiOutputStream.java + * @since 26.08.2008 + */ +package de.anomic.http; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * writes to multiple {@link OutputStream}s (parallel) + * + * @author daniel + * + */ +class MultiOutputStream extends OutputStream { + + private final OutputStream[] streams; + + /** + * creates a new MultiOutputStream + * + * @param streams the streams every written byte is forwarded to (the array is copied) + */ + public MultiOutputStream(final OutputStream[] streams) { + super(); + // make a copy to avoid external modifications + this.streams = new OutputStream[streams.length]; + System.arraycopy(streams, 0, this.streams, 0, streams.length); + } + + /** + * writes the byte to each of the streams, in array order; an IOException from one stream aborts the remaining writes + * + * @see java.io.OutputStream#write(int) + */ + @Override + public void write(int b) throws IOException { + for(OutputStream stream: streams) { + stream.write(b); + } + } + +} diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index efb63dce3..84863960a 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -55,7 +55,6 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; -import java.io.UnsupportedEncodingException; import java.io.Writer; import java.net.BindException; import java.net.ConnectException; @@ -71,6 +70,7 @@ import java.util.Date; import java.util.HashSet; import java.util.Iterator; import java.util.Properties; +import java.util.Set; import java.util.logging.FileHandler; import java.util.logging.Level; import java.util.logging.LogManager; @@ -177,7 +177,10 @@ public final class httpdProxyHandler { // create a htRootPath: system pages htRootPath = new File(switchboard.getRootPath(), switchboard.getConfig("htRootPath","htroot")); - if (!(htRootPath.exists())) htRootPath.mkdir(); + if (!(htRootPath.exists())) { + if(!htRootPath.mkdir()) + serverLog.logSevere("PROXY", "could not create htRoot "+ 
htRootPath); + } // load a transformer transformer = new htmlFilterContentTransformer(); @@ -224,6 +227,27 @@ public final class httpdProxyHandler { */ private static final StringBuffer userAgentStr = new StringBuffer(); + /** + * A Set of media types which are known to only contain binary data (no readable text) + * Each is only the first part of the content-type field (no subtypes) + */ + private static final Set binaryTypes = new HashSet(); + + /** + * A Set of content-types which are known to only contain binary data (no readable text) + * Each is a complete content-type header field (without parameters) + */ + private static final Set binaryContent = new HashSet(); + static { + // all Strings must be lower case!! + // RFC 2045: "Matching of media type and subtype is ALWAYS case-insensitive." + // discrete types + binaryTypes.add("image"); + binaryTypes.add("audio"); + binaryTypes.add("video"); + + binaryContent.add("application/octet-stream"); + } public static void handleOutgoingCookies(final httpRequestHeader requestHeader, final String targethost, final String clienthost) { /* @@ -449,7 +473,7 @@ public final class httpdProxyHandler { private static void fulfillRequestFromWeb(final Properties conProp, final yacyURL url,final String ext, final httpRequestHeader requestHeader, final httpResponseHeader cachedResponseHeader, final File cacheFile, final OutputStream respond) { final GZIPOutputStream gzippedOut = null; - Writer hfos = null; + Writer textOutput = null; JakartaCommonsHttpResponse res = null; try { @@ -532,19 +556,25 @@ public final class httpdProxyHandler { // handle file types and make (possibly transforming) output stream final OutputStream outStream = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? 
chunkedOut : respond); - if ( - (!transformer.isIdentityTransformer()) && - (plasmaParser.supportedHTMLContent(url,responseHeader.mime())) - ) { - // make a transformer - theLogger.logFine(reqID +" create transformer for URL " + url); - //hfos = new htmlFilterOutputStream((gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond), null, transformer, (ext.length() == 0)); - final Charset charSet = responseHeader.getCharSet(); - hfos = new htmlFilterWriter(outStream,charSet, null, transformer, (ext.length() == 0)); + final boolean isBinary = isBinary(responseHeader); + if(isBinary) { + theLogger.logFine(reqID +" create direct passthrough for URL " + url + ", extension '" + ext + "', mime-type '" + responseHeader.mime() + "'"); } else { - // simply pass through without parsing - theLogger.logFine(reqID +" create passthrough for URL " + url + ", extension '" + ext + "', mime-type '" + responseHeader.mime() + "'"); - hfos = new OutputStreamWriter(outStream, responseHeader.getCharSet()); + // handle text stuff (encoding and so on) + if ( + (!transformer.isIdentityTransformer()) && + (plasmaParser.supportedHTMLContent(url,responseHeader.mime())) + ) { + // make a transformer + theLogger.logFine(reqID +" create transformer for URL " + url); + //hfos = new htmlFilterOutputStream((gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? 
chunkedOut : respond), null, transformer, (ext.length() == 0)); + final Charset charSet = responseHeader.getCharSet(); + textOutput = new htmlFilterWriter(outStream,charSet, null, transformer, (ext.length() == 0)); + } else { + // simply pass through without parsing + theLogger.logFine(reqID +" create text passthrough for URL " + url + ", extension '" + ext + "', mime-type '" + responseHeader.mime() + "'"); + textOutput = new OutputStreamWriter(outStream, responseHeader.getCharSet()); + } } // handle incoming cookies @@ -587,7 +617,12 @@ public final class httpdProxyHandler { { // ok, we don't write actually into a file, only to RAM, and schedule writing the file. final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(); - writeContent(res, new BufferedWriter(hfos), byteStream); + if(isBinary) { + final OutputStream toClientAndMemory = new MultiOutputStream(new OutputStream[] {outStream, byteStream}); + serverFileUtils.copy(res.getDataAsStream(), toClientAndMemory); + } else { + writeTextContent(res, new BufferedWriter(textOutput), byteStream); + } // cached bytes byte[] cacheArray; if(byteStream.size() > 0) { @@ -597,7 +632,7 @@ public final class httpdProxyHandler { } theLogger.logFine(reqID +" writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length))); - if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).close(); + if (textOutput instanceof htmlFilterWriter) ((htmlFilterWriter) textOutput).close(); if (sizeBeforeDelete == -1) { // totally fresh file @@ -622,8 +657,14 @@ public final class httpdProxyHandler { // the file is too big to cache it in the ram, or the size is unknown // write to file right here. 
cacheFile.getParentFile().mkdirs(); - writeContent(res, new BufferedWriter(hfos), new FileOutputStream(cacheFile)); - if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).close(); + final OutputStream fileStream = new FileOutputStream(cacheFile); + if(isBinary) { + OutputStream toClientAndFile = new MultiOutputStream(new OutputStream[] {outStream, fileStream}); + serverFileUtils.copy(res.getDataAsStream(), toClientAndFile); + } else { + writeTextContent(res, new BufferedWriter(textOutput), fileStream); + } + if (textOutput instanceof htmlFilterWriter) ((htmlFilterWriter) textOutput).close(); theLogger.logFine(reqID +" for write-file of " + url + ": contentLength = " + contentLength + ", sizeBeforeDelete = " + sizeBeforeDelete); plasmaHTCache.writeFileAnnouncement(cacheFile); if (sizeBeforeDelete == -1) { @@ -652,8 +693,14 @@ public final class httpdProxyHandler { " StoreHTCache=" + storeHTCache + " SupportetContent=" + isSupportedContent); - writeContent(res, new BufferedWriter(hfos)); - if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).close(); + if(isBinary) { + // directly pass bytes to client + serverFileUtils.copy(res.getDataAsStream(), outStream); + } else { + // read data with specified encoding and send it as character stream + writeTextContent(res, new BufferedWriter(textOutput)); + } + if (textOutput instanceof htmlFilterWriter) ((htmlFilterWriter) textOutput).close(); /*if (sizeBeforeDelete == -1) { // no old file and no load. 
just data passing //cacheEntry.status = plasmaHTCache.CACHE_PASSING; @@ -701,7 +748,7 @@ public final class httpdProxyHandler { final httpChunkedOutputStream chunkedOut = null; final GZIPOutputStream gzippedOut = null; - Object hfos = null; + Writer textOutput = null; // we respond on the request by using the cache, the cache is fresh try { @@ -754,20 +801,19 @@ public final class httpdProxyHandler { if (( !transformer.isIdentityTransformer()) && (ext == null || !plasmaParser.supportedHTMLFileExtContains(url)) && (plasmaParser.HTMLParsableMimeTypesContains(cachedResponseHeader.mime()))) { - hfos = new htmlFilterWriter(outStream, charSet, null, transformer, (ext == null || ext.length() == 0)); - } else { - hfos = outStream; + textOutput = new htmlFilterWriter(outStream, charSet, null, transformer, (ext == null || ext.length() == 0)); } // send also the complete body now from the cache // simply read the file and transfer to out socket - if (hfos instanceof OutputStream) { - serverFileUtils.copy(cacheFile,(OutputStream)hfos); - } else if (hfos instanceof Writer) { - serverFileUtils.copy(cacheFile,charSet,(Writer)hfos); + if(textOutput != null && !isBinary(cachedResponseHeader)) { + // send as encoded text + serverFileUtils.copy(cacheFile, charSet, textOutput); + } else { + serverFileUtils.copy(cacheFile, outStream); } - if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).close(); + if (textOutput != null) textOutput.close(); if (gzippedOut != null) gzippedOut.finish(); if (chunkedOut != null) chunkedOut.finish(); } @@ -787,24 +833,63 @@ public final class httpdProxyHandler { return; } - public static void writeContent(final JakartaCommonsHttpResponse res, final BufferedWriter hfos) throws IOException, UnsupportedEncodingException { + /** + * determines if the body is known to be binary data (no readable text) or not + * + * @param responseHeader header whose mime type is inspected + * @return true if the mime type (parameters cut off) is in binaryContent or its top-level type is in binaryTypes + */ + private static boolean isBinary(httpResponseHeader responseHeader) { + String mime = responseHeader.mime().toLowerCase(); + 
if(mime.contains(";")) { + // cut off parameters + mime = mime.substring(0, mime.indexOf(';')); + } + // mime and the contents of the Set must be lower case! + if(binaryContent.contains(mime)) { + return true; + } + final int endType = mime.contains("/") ? mime.indexOf('/') : mime.length(); + final String type = mime.substring(0, endType); + if(binaryTypes.contains(type)) { + return true; + } + return false; + } + + /** + * read the body of res with charSet and write it to output + * + * @param res response whose data stream is read; its stream is closed afterwards + * @param output writer the body is copied to + * @throws IOException + */ + public static void writeTextContent(final JakartaCommonsHttpResponse res, final BufferedWriter output) throws IOException { try { final InputStream data = res.getDataAsStream(); if (data == null) return; final Charset charSet = res.getResponseHeader().getCharSet(); - serverFileUtils.copyToWriter(new BufferedInputStream(data), hfos, charSet); + serverFileUtils.copyToWriter(new BufferedInputStream(data), output, charSet); } finally { res.closeStream(); } } - public static void writeContent(final JakartaCommonsHttpResponse res, final BufferedWriter hfos, final OutputStream byteStream) throws IOException, UnsupportedEncodingException { + /** + * read the body of res with charSet and write it to output and parallel encoded with charSet to byteStream + * + * @param res response whose data stream is read; its stream is closed afterwards + * @param output writer the body is copied to + * @param byteStream must not be null; receives the body through an OutputStreamWriter using charSet + * @throws IOException + */ + public static void writeTextContent(final JakartaCommonsHttpResponse res, final BufferedWriter output, final OutputStream byteStream) throws IOException { assert byteStream != null; try { final InputStream data = res.getDataAsStream(); if (data == null) return; final Charset charSet = res.getResponseHeader().getCharSet(); - serverFileUtils.copyToWriters(new BufferedInputStream(data), hfos, new BufferedWriter(new OutputStreamWriter(byteStream, charSet)) , charSet); + serverFileUtils.copyToWriters(new BufferedInputStream(data), output, new BufferedWriter(new OutputStreamWriter(byteStream, 
charSet)) , charSet); } finally { res.closeStream(); } @@ -1031,7 +1116,7 @@ public final class httpdProxyHandler { } if (chunked != null) chunked.finish(); */ - writeContent(res, new BufferedWriter(new OutputStreamWriter((chunked != null) ? chunked : countedRespond))); + writeTextContent(res, new BufferedWriter(new OutputStreamWriter((chunked != null) ? chunked : countedRespond))); countedRespond.flush(); } finally { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 50dac3278..eaf09556e 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -980,7 +980,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch