diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java
index c4ff62fad..8e1f21d7e 100644
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@@ -141,7 +141,7 @@ public class HostBrowser {
                         sb.peers.mySeed().hash.getBytes(),
                         url, null, load, new Date(),
                         sb.crawler.defaultProxyProfile.handle(),
-                        0, 0, 0, 0
+                        0, 0, 0
                     ));
                 prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
                 if (wait) for (int i = 0; i < 30; i++) {
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 34f9759e4..d79613767 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -173,7 +173,6 @@ public class QuickCrawlLink_p {
                 pe.handle(),
                 0,
                 0,
-                0,
                 0
             ));
diff --git a/htroot/rct_p.java b/htroot/rct_p.java
index 8c05077d5..f355f7d11 100644
--- a/htroot/rct_p.java
+++ b/htroot/rct_p.java
@@ -80,9 +80,7 @@ public class rct_p {
                             sb.crawler.defaultRemoteProfile.handle(),
                             0,
                             0,
-                            0,
-                            item.getSize()
-                            ));
+                            0));
                 } else {
                     env.getLog().warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
                 }
diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java
index 3d804caa3..2e3c5078c 100644
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@@ -57,6 +57,7 @@ import net.yacy.cora.protocol.TimeoutRequest;
 import net.yacy.cora.protocol.ftp.FTPClient;
 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.util.CommonPattern;
+import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.parser.html.CharacterCoding;
 
 /**
@@ -1353,7 +1354,7 @@ public class MultiProtocolURL implements Serializable, Comparable implements Map= 0 && entry.size() > maxFileSize) ||
-            contentDomain == ContentDomain.APP ||
             (contentDomain == ContentDomain.IMAGE && TextParser.supportsExtension(entry.url()) != null) ||
             contentDomain == ContentDomain.AUDIO ||
             contentDomain == ContentDomain.VIDEO ||
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index f16ffc19d..b6d4e8527 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -524,8 +524,7 @@ public class CrawlQueues {
                             this.sb.crawler.defaultRemoteProfile.handle(),
                             0,
                             0,
-                            0,
-                            item.getSize()
+                            0
                         ));
                 } else {
                     CrawlQueues.log.warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
                 }
diff --git a/source/net/yacy/crawler/retrieval/Request.java b/source/net/yacy/crawler/retrieval/Request.java
index 0a46d75c2..bcb5c6b24 100644
--- a/source/net/yacy/crawler/retrieval/Request.java
+++ b/source/net/yacy/crawler/retrieval/Request.java
@@ -92,7 +92,6 @@ public class Request extends WorkflowJob
     private int             anchors;        // number of anchors of the parent
     private int             forkfactor;     // sum of anchors of all ancestors
     private Bitfield        flags;
-    private long            size;           // size of resource in bytes (if known) or 0 if not known
     private String          statusMessage;
     private int             initialHash;    // to provide a object hash that does not change even if the url changes because of redirection
 
@@ -111,7 +110,6 @@ public class Request extends WorkflowJob
         this.statusMessage = null;
         this.initialHash = 0;
         this.status = 0;
-        this.size = 0;
     }
 
     /**
@@ -121,7 +119,7 @@ public class Request extends WorkflowJob
      * @param referrerhash
      */
     public Request(final DigestURL url, final byte[] referrerhash) {
-        this(null, url, referrerhash, null, null, null, 0, 0, 0, 0);
+        this(null, url, referrerhash, null, null, null, 0, 0, 0);
     }
 
     /**
@@ -146,8 +144,7 @@ public class Request extends WorkflowJob
             final String profileHandle,
             final int depth,
             final int anchors,
-            final int forkfactor,
-            final long size) {
+            final int forkfactor) {
         // create new entry and store it into database
         assert url != null;
         assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle
@@ -167,7 +164,6 @@ public class Request extends WorkflowJob
         this.statusMessage = "loaded(args)";
         this.initialHash = url.hashCode();
         this.status = WorkflowJob.STATUS_INITIATED;
-        this.size = size;
     }
 
     public Request(final Row.Entry entry) throws IOException {
@@ -195,7 +191,6 @@ public class Request extends WorkflowJob
             this.flags = new Bitfield(entry.getColBytes(10, true));
             //this.loaddate = entry.getColLong(12);
             //this.lastmodified = entry.getColLong(13);
-            this.size = entry.getColLong(14);
             this.statusMessage = "loaded(kelondroRow.Entry)";
             this.initialHash = this.url.hashCode();
         } catch (final Throwable e ) {
@@ -224,7 +219,6 @@ public class Request extends WorkflowJob
         final byte[] appdatestr = NaturalOrder.encodeLong(this.appdate, rowdef.width(5));
         final byte[] loaddatestr = NaturalOrder.encodeLong(0 /*loaddate*/, rowdef.width(12));
         final byte[] serverdatestr = NaturalOrder.encodeLong(0 /*lastmodified*/, rowdef.width(13));
-        final byte[] sizestr = NaturalOrder.encodeLong(this.size, rowdef.width(14));
         // store the hash in the hash cache
         final byte[] namebytes = UTF8.getBytes(this.name);
         final byte[][] entry =
@@ -243,7 +237,7 @@ public class Request extends WorkflowJob
                 NaturalOrder.encodeLong(0, rowdef.width(11)),
                 loaddatestr,
                 serverdatestr,
-                sizestr
+                new byte[0] // dummy, not used (any more)
             };
         return rowdef.newEntry(entry);
     }
@@ -277,27 +271,7 @@ public class Request extends WorkflowJob
         // the date when the url appeared first
         return new Date(this.appdate);
     }
-
-    /*
-    public Date loaddate() {
-        // the date when the url was loaded
-        return new Date(this.loaddate);
-    }
-
-    public Date lastmodified() {
-        // the date that the server returned as document date
-        return new Date(this.lastmodified);
-    }
-    */
-    public long size() {
-        // the date that the client (browser) send as ifModifiedSince in proxy mode
-        return this.size;
-    }
-    public boolean isEmpty() {
-        return this.size == 0;
-    }
-
+
     public String name() {
         // return the anchor name (text inside <a> tag)
         return this.name;
diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java
index e6b859115..004af54a8 100644
--- a/source/net/yacy/crawler/retrieval/Response.java
+++ b/source/net/yacy/crawler/retrieval/Response.java
@@ -159,6 +159,9 @@ public class Response {
         this.status = QUEUE_STATE_FRESH;
         this.content = content;
         this.fromCache = fromCache;
+        if (this.responseHeader != null && content != null && Integer.parseInt(this.responseHeader.get(HeaderFramework.CONTENT_LENGTH, "0")) <= content.length) {
+            this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Integer.toString(content.length)); // repair length
+        }
     }
 
     /**
@@ -173,11 +176,11 @@ public class Response {
         this.requestHeader = new RequestHeader();
         this.responseHeader = new ResponseHeader(200);
         this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(MultiProtocolURL.getFileExtension(request.url().getFileName()), "text/plain")); // tell parser how to handle the content
-        if (!request.isEmpty()) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
         this.profile = profile;
         this.status = QUEUE_STATE_FRESH;
         this.content = request.name().length() > 0 ? UTF8.getBytes(request.name()) : UTF8.getBytes(request.url().toTokens());
         this.fromCache = true;
+        if (this.responseHeader != null) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, "0"); // 'virtual' length, shows that the resource was not loaded
     }
 
     public Response(
@@ -262,6 +265,9 @@ public class Response {
 
     public void setContent(final byte[] data) {
         this.content = data;
+        if (this.responseHeader != null && this.content != null && Integer.parseInt(this.responseHeader.get(HeaderFramework.CONTENT_LENGTH, "0")) <= content.length) {
+            this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Integer.toString(content.length)); // repair length
+        }
     }
 
     public byte[] getContent() {
diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java
index ece007586..a2530a391 100644
--- a/source/net/yacy/crawler/retrieval/SitemapImporter.java
+++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java
@@ -104,7 +104,6 @@ public class SitemapImporter extends Thread {
                     this.crawlingProfile.handle(),
                     0,
                     0,
-                    0,
                     0
                     ));
                 logger.info("New URL '" + entry.url() + "' added for loading.");
diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java
index 753c77ff6..bbda8b5b7 100644
--- a/source/net/yacy/data/ymark/YMarkCrawlStart.java
+++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java
@@ -1,5 +1,5 @@
 // YMarkCrawlStart.java
-// (C) 2012 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
+// (C) 2012 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
 // first published 2011 on http://yacy.net
 //
 // This is a part of YaCy, a peer-to-peer based web search engine
@@ -195,7 +195,7 @@ public class YMarkCrawlStart extends HashMap{
             null,
             "CRAWLING-ROOT",
             new Date(),
-            pe.handle(), 0, 0, 0, 0
+            pe.handle(), 0, 0, 0
             ));
         }
     }
diff --git a/source/net/yacy/http/ProxyCacheHandler.java b/source/net/yacy/http/ProxyCacheHandler.java
index 4a303effc..660463d3a 100644
--- a/source/net/yacy/http/ProxyCacheHandler.java
+++ b/source/net/yacy/http/ProxyCacheHandler.java
@@ -76,7 +76,6 @@ public class ProxyCacheHandler extends AbstractRemoteHandler implements Handler
                 sb.crawler.defaultProxyProfile.handle(),
                 0,
                 0,
-                0,
                 0);
 
         final Response cachedResponse = new Response(
diff --git a/source/net/yacy/http/ProxyHandler.java b/source/net/yacy/http/ProxyHandler.java
index ff3d6f6c3..1b97d11d8 100644
--- a/source/net/yacy/http/ProxyHandler.java
+++ b/source/net/yacy/http/ProxyHandler.java
@@ -189,7 +189,6 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
             sb.crawler.defaultProxyProfile.handle(),
             0,
             0,
-            0,
             0); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
         final Response yacyResponse = new Response(
                 yacyRequest,
@@ -251,7 +250,7 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
      * adds specific header elements for the connection of the internal
      * httpclient to the remote server according to local config
      *
-     * @param header header für http client (already preset with headers from
+     * @param header header für http client (already preset with headers from
      *               original ServletRequest
      * @param origServletRequest original request/header
      */
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 668ebb2a5..f31c6e551 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -128,7 +128,6 @@ public final class LoaderDispatcher {
                 this.sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
             0,
             0,
-            0,
             0);
     }
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index b5589d18a..2e58a8885 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -199,7 +199,6 @@ import net.yacy.search.ranking.RankingProfile;
 import net.yacy.search.schema.CollectionConfiguration;
 import net.yacy.search.schema.CollectionSchema;
 import net.yacy.search.schema.WebgraphConfiguration;
-import net.yacy.search.schema.WebgraphSchema;
 import net.yacy.server.serverCore;
 import net.yacy.server.serverSwitch;
 import net.yacy.server.http.RobotsTxtConfig;
@@ -1926,7 +1925,6 @@ public final class Switchboard extends serverSwitch {
                         this.crawler.defaultSurrogateProfile.handle(),
                         0,
                         0,
-                        0,
                         0);
                 response = new Response(request, null, null, this.crawler.defaultSurrogateProfile, false);
                 final IndexingQueueEntry queueEntry =
@@ -2634,8 +2632,7 @@ public final class Switchboard extends serverSwitch {
                             response.profile().handle(),
                             response.depth() + 1,
                             0,
-                            0,
-                            response.size() < 0 ? 0 : response.size()));
+                            0));
                 } catch (final MalformedURLException e ) {
                     ConcurrentLog.logException(e);
                 }
@@ -3013,7 +3010,6 @@ public final class Switchboard extends serverSwitch {
                     profile.handle(),
                     0,
                     0,
-                    0,
                     0
                     ));
diff --git a/source/net/yacy/server/http/HTTPDProxyHandler.java b/source/net/yacy/server/http/HTTPDProxyHandler.java
index f48bc4eb6..e50f51c93 100644
--- a/source/net/yacy/server/http/HTTPDProxyHandler.java
+++ b/source/net/yacy/server/http/HTTPDProxyHandler.java
@@ -380,7 +380,6 @@ public final class HTTPDProxyHandler {
                     sb.crawler.defaultProxyProfile.handle(),
                     0,
                     0,
-                    0,
                     0);
             final Response response = new Response(
                     request,
@@ -509,8 +508,7 @@ public final class HTTPDProxyHandler {
                     sb.crawler.defaultProxyProfile.handle(),
                     0,
                     0,
-                    0,
-                    sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
+                    0);
 
             // handle incoming cookies
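
For orientation only (not part of the patch): after this change the Request constructor no longer takes a trailing size argument, so call sites pass nine arguments ending with depth, anchors and forkfactor. A minimal sketch of the two remaining call forms follows; the class name RequestConstructionSketch, the example URL and all argument values are illustrative assumptions, not code from the repository.

    import java.util.Date;

    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.crawler.retrieval.Request;

    public class RequestConstructionSketch {
        public static void main(final String[] args) throws Exception {
            // hypothetical target URL, used only for this sketch
            final DigestURL url = new DigestURL("http://example.org/index.html");

            // short form: initiator, name, date and profile handle default to null/empty
            final Request simple = new Request(url, null);

            // full form: the former size argument is gone; the last three ints are
            // depth, anchors and forkfactor
            final Request full = new Request(
                    null,          // initiator peer hash (none in this sketch)
                    url,
                    null,          // referrer hash
                    "example",     // anchor name
                    new Date(),    // date of first appearance
                    null,          // crawl profile handle
                    0,             // depth
                    0,             // anchors
                    0);            // forkfactor

            System.out.println(simple.url() + " / " + full.name());
        }
    }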