fix for bad/missing values of size_i

pull/1/head
Michael Peter Christen 11 years ago
parent 6306d28a6a
commit b08375da33

@ -141,7 +141,7 @@ public class HostBrowser {
sb.peers.mySeed().hash.getBytes(), sb.peers.mySeed().hash.getBytes(),
url, null, load, new Date(), url, null, load, new Date(),
sb.crawler.defaultProxyProfile.handle(), sb.crawler.defaultProxyProfile.handle(),
0, 0, 0, 0 0, 0, 0
)); ));
prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString)); prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
if (wait) for (int i = 0; i < 30; i++) { if (wait) for (int i = 0; i < 30; i++) {

@ -173,7 +173,6 @@ public class QuickCrawlLink_p {
pe.handle(), pe.handle(),
0, 0,
0, 0,
0,
0 0
)); ));

@ -80,9 +80,7 @@ public class rct_p {
sb.crawler.defaultRemoteProfile.handle(), sb.crawler.defaultRemoteProfile.handle(),
0, 0,
0, 0,
0, 0));
item.getSize()
));
} else { } else {
env.getLog().warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason); env.getLog().warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
} }

@ -57,6 +57,7 @@ import net.yacy.cora.protocol.TimeoutRequest;
import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
/** /**
@ -1353,7 +1354,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
} else if (host_tld.equals("lk")) {//Sri Lanka /1,770,000 } else if (host_tld.equals("lk")) {//Sri Lanka /1,770,000
language = "si";//sinhala; sin language = "si";//sinhala; sin
//language = "ta";//tamil; tam //language = "ta";//tamil; tam
} else if (host_tld.equals("la")) {//Laos (Lao Peoples Democratic Republic) /932,000 } else if (host_tld.equals("la")) {//Laos (Lao People's Democratic Republic) /932,000
language = "lo";//lao; lao language = "lo";//lao; lao
} else if (host_tld.equals("ly")) {//Libya /388,000 } else if (host_tld.equals("ly")) {//Libya /388,000
language = "ar";//libyan arabic; ara; ayl language = "ar";//libyan arabic; ara; ayl
@ -1597,7 +1598,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
//language = "en";//english (official) //language = "en";//english (official)
} else if (host_tld.equals("sb")) {//Solomon Islands /11,800 } else if (host_tld.equals("sb")) {//Solomon Islands /11,800
language = "en";//Pijin (Solomons Pidgin or Neo-Solomonic); cpe; pis language = "en";//Pijin (Solomons Pidgin or Neo-Solomonic); cpe; pis
//language = "en";//english (12%) //language = "en";//english (1-2%)
} else if (host_tld.equals("sd")) {//Sudan /11,700 } else if (host_tld.equals("sd")) {//Sudan /11,700
language = "ar";//sudanese arabic; ara; apd language = "ar";//sudanese arabic; ara; apd
//language = "en";//english //language = "en";//english
@ -1995,11 +1996,13 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (isFile()) try { if (isFile()) try {
return getFSFile().length(); return getFSFile().length();
} catch (final Throwable e) { } catch (final Throwable e) {
ConcurrentLog.logException(e);
return -1; return -1;
} }
if (isSMB()) try { if (isSMB()) try {
return TimeoutRequest.length(getSmbFile(), SMB_TIMEOUT); return TimeoutRequest.length(getSmbFile(), SMB_TIMEOUT);
} catch (final Throwable e) { } catch (final Throwable e) {
ConcurrentLog.logException(e);
return -1; return -1;
} }
return -1; return -1;

@ -43,6 +43,7 @@ import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.NumberTools; import net.yacy.cora.util.NumberTools;
@ -458,6 +459,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
try { try {
return (int) Long.parseLong(get(CONTENT_LENGTH)); return (int) Long.parseLong(get(CONTENT_LENGTH));
} catch (final NumberFormatException e) { } catch (final NumberFormatException e) {
ConcurrentLog.warn("HeaderFramework", "content-length cannot be parsed: " + get(CONTENT_LENGTH));
return -1; return -1;
} }
} }

@ -50,10 +50,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues; import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL; import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.FTPLoader;
import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.crawler.robots.RobotsTxt; import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReference;
@ -228,7 +225,6 @@ public final class CrawlStacker {
profileHandle, profileHandle,
0, 0,
0, 0,
0,
0 0
)); ));
} }
@ -270,9 +266,7 @@ public final class CrawlStacker {
profileHandle, profileHandle,
0, 0,
0, 0,
0, 0));
entry.size
));
} }
} catch (final IOException e1) { } catch (final IOException e1) {
ConcurrentLog.logException(e1); ConcurrentLog.logException(e1);
@ -298,9 +292,7 @@ public final class CrawlStacker {
pe.handle(), pe.handle(),
0, 0,
0, 0,
0, 0));
0
));
} }
/** /**
@ -344,19 +336,10 @@ public final class CrawlStacker {
return error; return error;
} }
long maxFileSize = Long.MAX_VALUE;
if (!entry.isEmpty()) {
final String protocol = entry.url().getProtocol();
if (protocol.equals("http") || protocol.equals("https")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
if (protocol.equals("ftp")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.ftp.maxFileSize", FTPLoader.DEFAULT_MAXFILESIZE);
if (protocol.equals("smb")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.smb.maxFileSize", SMBLoader.DEFAULT_MAXFILESIZE);
}
// check availability of parser and maxfilesize // check availability of parser and maxfilesize
String warning = null; String warning = null;
ContentDomain contentDomain = entry.url().getContentDomainFromExt(); ContentDomain contentDomain = entry.url().getContentDomainFromExt();
if ((maxFileSize >= 0 && entry.size() > maxFileSize) || if (contentDomain == ContentDomain.APP ||
contentDomain == ContentDomain.APP ||
(contentDomain == ContentDomain.IMAGE && TextParser.supportsExtension(entry.url()) != null) || (contentDomain == ContentDomain.IMAGE && TextParser.supportsExtension(entry.url()) != null) ||
contentDomain == ContentDomain.AUDIO || contentDomain == ContentDomain.AUDIO ||
contentDomain == ContentDomain.VIDEO || contentDomain == ContentDomain.VIDEO ||

@ -524,8 +524,7 @@ public class CrawlQueues {
this.sb.crawler.defaultRemoteProfile.handle(), this.sb.crawler.defaultRemoteProfile.handle(),
0, 0,
0, 0,
0, 0
item.getSize()
)); ));
} else { } else {
CrawlQueues.log.warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason); CrawlQueues.log.warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);

@ -92,7 +92,6 @@ public class Request extends WorkflowJob
private int anchors; // number of anchors of the parent private int anchors; // number of anchors of the parent
private int forkfactor; // sum of anchors of all ancestors private int forkfactor; // sum of anchors of all ancestors
private Bitfield flags; private Bitfield flags;
private long size; // size of resource in bytes (if known) or 0 if not known
private String statusMessage; private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
@ -111,7 +110,6 @@ public class Request extends WorkflowJob
this.statusMessage = null; this.statusMessage = null;
this.initialHash = 0; this.initialHash = 0;
this.status = 0; this.status = 0;
this.size = 0;
} }
/** /**
@ -121,7 +119,7 @@ public class Request extends WorkflowJob
* @param referrerhash * @param referrerhash
*/ */
public Request(final DigestURL url, final byte[] referrerhash) { public Request(final DigestURL url, final byte[] referrerhash) {
this(null, url, referrerhash, null, null, null, 0, 0, 0, 0); this(null, url, referrerhash, null, null, null, 0, 0, 0);
} }
/** /**
@ -146,8 +144,7 @@ public class Request extends WorkflowJob
final String profileHandle, final String profileHandle,
final int depth, final int depth,
final int anchors, final int anchors,
final int forkfactor, final int forkfactor) {
final long size) {
// create new entry and store it into database // create new entry and store it into database
assert url != null; assert url != null;
assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle
@ -167,7 +164,6 @@ public class Request extends WorkflowJob
this.statusMessage = "loaded(args)"; this.statusMessage = "loaded(args)";
this.initialHash = url.hashCode(); this.initialHash = url.hashCode();
this.status = WorkflowJob.STATUS_INITIATED; this.status = WorkflowJob.STATUS_INITIATED;
this.size = size;
} }
public Request(final Row.Entry entry) throws IOException { public Request(final Row.Entry entry) throws IOException {
@ -195,7 +191,6 @@ public class Request extends WorkflowJob
this.flags = new Bitfield(entry.getColBytes(10, true)); this.flags = new Bitfield(entry.getColBytes(10, true));
//this.loaddate = entry.getColLong(12); //this.loaddate = entry.getColLong(12);
//this.lastmodified = entry.getColLong(13); //this.lastmodified = entry.getColLong(13);
this.size = entry.getColLong(14);
this.statusMessage = "loaded(kelondroRow.Entry)"; this.statusMessage = "loaded(kelondroRow.Entry)";
this.initialHash = this.url.hashCode(); this.initialHash = this.url.hashCode();
} catch (final Throwable e ) { } catch (final Throwable e ) {
@ -224,7 +219,6 @@ public class Request extends WorkflowJob
final byte[] appdatestr = NaturalOrder.encodeLong(this.appdate, rowdef.width(5)); final byte[] appdatestr = NaturalOrder.encodeLong(this.appdate, rowdef.width(5));
final byte[] loaddatestr = NaturalOrder.encodeLong(0 /*loaddate*/, rowdef.width(12)); final byte[] loaddatestr = NaturalOrder.encodeLong(0 /*loaddate*/, rowdef.width(12));
final byte[] serverdatestr = NaturalOrder.encodeLong(0 /*lastmodified*/, rowdef.width(13)); final byte[] serverdatestr = NaturalOrder.encodeLong(0 /*lastmodified*/, rowdef.width(13));
final byte[] sizestr = NaturalOrder.encodeLong(this.size, rowdef.width(14));
// store the hash in the hash cache // store the hash in the hash cache
final byte[] namebytes = UTF8.getBytes(this.name); final byte[] namebytes = UTF8.getBytes(this.name);
final byte[][] entry = final byte[][] entry =
@ -243,7 +237,7 @@ public class Request extends WorkflowJob
NaturalOrder.encodeLong(0, rowdef.width(11)), NaturalOrder.encodeLong(0, rowdef.width(11)),
loaddatestr, loaddatestr,
serverdatestr, serverdatestr,
sizestr new byte[0] // dummy, not used (any more)
}; };
return rowdef.newEntry(entry); return rowdef.newEntry(entry);
} }
@ -277,27 +271,7 @@ public class Request extends WorkflowJob
// the date when the url appeared first // the date when the url appeared first
return new Date(this.appdate); return new Date(this.appdate);
} }
/*
public Date loaddate() {
// the date when the url was loaded
return new Date(this.loaddate);
}
public Date lastmodified() {
// the date that the server returned as document date
return new Date(this.lastmodified);
}
*/
public long size() {
// the date that the client (browser) send as ifModifiedSince in proxy mode
return this.size;
}
public boolean isEmpty() {
return this.size == 0;
}
public String name() { public String name() {
// return the anchor name (text inside <a> tag) // return the anchor name (text inside <a> tag)
return this.name; return this.name;

@ -159,6 +159,9 @@ public class Response {
this.status = QUEUE_STATE_FRESH; this.status = QUEUE_STATE_FRESH;
this.content = content; this.content = content;
this.fromCache = fromCache; this.fromCache = fromCache;
if (this.responseHeader != null && content != null && Integer.parseInt(this.responseHeader.get(HeaderFramework.CONTENT_LENGTH, "0")) <= content.length) {
this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Integer.toString(content.length)); // repair length
}
} }
/** /**
@ -173,11 +176,11 @@ public class Response {
this.requestHeader = new RequestHeader(); this.requestHeader = new RequestHeader();
this.responseHeader = new ResponseHeader(200); this.responseHeader = new ResponseHeader(200);
this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(MultiProtocolURL.getFileExtension(request.url().getFileName()), "text/plain")); // tell parser how to handle the content this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(MultiProtocolURL.getFileExtension(request.url().getFileName()), "text/plain")); // tell parser how to handle the content
if (!request.isEmpty()) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
this.profile = profile; this.profile = profile;
this.status = QUEUE_STATE_FRESH; this.status = QUEUE_STATE_FRESH;
this.content = request.name().length() > 0 ? UTF8.getBytes(request.name()) : UTF8.getBytes(request.url().toTokens()); this.content = request.name().length() > 0 ? UTF8.getBytes(request.name()) : UTF8.getBytes(request.url().toTokens());
this.fromCache = true; this.fromCache = true;
if (this.responseHeader != null) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, "0"); // 'virtual' length, shows that the resource was not loaded
} }
public Response( public Response(
@ -262,6 +265,9 @@ public class Response {
public void setContent(final byte[] data) { public void setContent(final byte[] data) {
this.content = data; this.content = data;
if (this.responseHeader != null && this.content != null && Integer.parseInt(this.responseHeader.get(HeaderFramework.CONTENT_LENGTH, "0")) <= content.length) {
this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Integer.toString(content.length)); // repair length
}
} }
public byte[] getContent() { public byte[] getContent() {

@ -104,7 +104,6 @@ public class SitemapImporter extends Thread {
this.crawlingProfile.handle(), this.crawlingProfile.handle(),
0, 0,
0, 0,
0,
0 0
)); ));
logger.info("New URL '" + entry.url() + "' added for loading."); logger.info("New URL '" + entry.url() + "' added for loading.");

@ -1,5 +1,5 @@
// YMarkCrawlStart.java // YMarkCrawlStart.java
// (C) 2012 by Stefan Förster, sof@gmx.de, Norderstedt, Germany // (C) 2012 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
// first published 2011 on http://yacy.net // first published 2011 on http://yacy.net
// //
// This is a part of YaCy, a peer-to-peer based web search engine // This is a part of YaCy, a peer-to-peer based web search engine
@ -195,7 +195,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
null, null,
"CRAWLING-ROOT", "CRAWLING-ROOT",
new Date(), new Date(),
pe.handle(), 0, 0, 0, 0 pe.handle(), 0, 0, 0
)); ));
} }
} }

@ -76,7 +76,6 @@ public class ProxyCacheHandler extends AbstractRemoteHandler implements Handler
sb.crawler.defaultProxyProfile.handle(), sb.crawler.defaultProxyProfile.handle(),
0, 0,
0, 0,
0,
0); 0);
final Response cachedResponse = new Response( final Response cachedResponse = new Response(

@ -189,7 +189,6 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
sb.crawler.defaultProxyProfile.handle(), sb.crawler.defaultProxyProfile.handle(),
0, 0,
0, 0,
0,
0); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete); 0); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
final Response yacyResponse = new Response( final Response yacyResponse = new Response(
yacyRequest, yacyRequest,
@ -251,7 +250,7 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
* adds specific header elements for the connection of the internal * adds specific header elements for the connection of the internal
* httpclient to the remote server according to local config * httpclient to the remote server according to local config
* *
* @param header header für http client (already preset with headers from * @param header header für http client (already preset with headers from
* original ServletRequest * original ServletRequest
* @param origServletRequest original request/header * @param origServletRequest original request/header
*/ */

@ -128,7 +128,6 @@ public final class LoaderDispatcher {
this.sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile this.sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
0, 0,
0, 0,
0,
0); 0);
} }

@ -199,7 +199,6 @@ import net.yacy.search.ranking.RankingProfile;
import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphConfiguration; import net.yacy.search.schema.WebgraphConfiguration;
import net.yacy.search.schema.WebgraphSchema;
import net.yacy.server.serverCore; import net.yacy.server.serverCore;
import net.yacy.server.serverSwitch; import net.yacy.server.serverSwitch;
import net.yacy.server.http.RobotsTxtConfig; import net.yacy.server.http.RobotsTxtConfig;
@ -1926,7 +1925,6 @@ public final class Switchboard extends serverSwitch {
this.crawler.defaultSurrogateProfile.handle(), this.crawler.defaultSurrogateProfile.handle(),
0, 0,
0, 0,
0,
0); 0);
response = new Response(request, null, null, this.crawler.defaultSurrogateProfile, false); response = new Response(request, null, null, this.crawler.defaultSurrogateProfile, false);
final IndexingQueueEntry queueEntry = final IndexingQueueEntry queueEntry =
@ -2634,8 +2632,7 @@ public final class Switchboard extends serverSwitch {
response.profile().handle(), response.profile().handle(),
response.depth() + 1, response.depth() + 1,
0, 0,
0, 0));
response.size() < 0 ? 0 : response.size()));
} catch (final MalformedURLException e ) { } catch (final MalformedURLException e ) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
@ -3013,7 +3010,6 @@ public final class Switchboard extends serverSwitch {
profile.handle(), profile.handle(),
0, 0,
0, 0,
0,
0 0
)); ));

@ -380,7 +380,6 @@ public final class HTTPDProxyHandler {
sb.crawler.defaultProxyProfile.handle(), sb.crawler.defaultProxyProfile.handle(),
0, 0,
0, 0,
0,
0); 0);
final Response response = new Response( final Response response = new Response(
request, request,
@ -509,8 +508,7 @@ public final class HTTPDProxyHandler {
sb.crawler.defaultProxyProfile.handle(), sb.crawler.defaultProxyProfile.handle(),
0, 0,
0, 0,
0, 0);
sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
// handle incoming cookies // handle incoming cookies

Loading…
Cancel
Save