fix for bad/missing values of size_i

pull/1/head
Michael Peter Christen 11 years ago
parent 6306d28a6a
commit b08375da33

@ -141,7 +141,7 @@ public class HostBrowser {
sb.peers.mySeed().hash.getBytes(),
url, null, load, new Date(),
sb.crawler.defaultProxyProfile.handle(),
0, 0, 0, 0
0, 0, 0
));
prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
if (wait) for (int i = 0; i < 30; i++) {

@ -173,7 +173,6 @@ public class QuickCrawlLink_p {
pe.handle(),
0,
0,
0,
0
));

@ -80,9 +80,7 @@ public class rct_p {
sb.crawler.defaultRemoteProfile.handle(),
0,
0,
0,
item.getSize()
));
0));
} else {
env.getLog().warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
}

@ -57,6 +57,7 @@ import net.yacy.cora.protocol.TimeoutRequest;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.html.CharacterCoding;
/**
@ -1353,7 +1354,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
} else if (host_tld.equals("lk")) {//Sri Lanka /1,770,000
language = "si";//sinhala; sin
//language = "ta";//tamil; tam
} else if (host_tld.equals("la")) {//Laos (Lao Peoples Democratic Republic) /932,000
} else if (host_tld.equals("la")) {//Laos (Lao People’s Democratic Republic) /932,000
language = "lo";//lao; lao
} else if (host_tld.equals("ly")) {//Libya /388,000
language = "ar";//libyan arabic; ara; ayl
@ -1597,7 +1598,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
//language = "en";//english (official)
} else if (host_tld.equals("sb")) {//Solomon Islands /11,800
language = "en";//Pijin (Solomons Pidgin or Neo-Solomonic); cpe; pis
//language = "en";//english (12%)
//language = "en";//english (1–2%)
} else if (host_tld.equals("sd")) {//Sudan /11,700
language = "ar";//sudanese arabic; ara; apd
//language = "en";//english
@ -1995,11 +1996,13 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (isFile()) try {
return getFSFile().length();
} catch (final Throwable e) {
ConcurrentLog.logException(e);
return -1;
}
if (isSMB()) try {
return TimeoutRequest.length(getSmbFile(), SMB_TIMEOUT);
} catch (final Throwable e) {
ConcurrentLog.logException(e);
return -1;
}
return -1;

@ -43,6 +43,7 @@ import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.NumberTools;
@ -458,6 +459,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
try {
return (int) Long.parseLong(get(CONTENT_LENGTH));
} catch (final NumberFormatException e) {
ConcurrentLog.warn("HeaderFramework", "content-length cannot be parsed: " + get(CONTENT_LENGTH));
return -1;
}
}

@ -50,10 +50,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.FTPLoader;
import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.citation.CitationReference;
@ -228,7 +225,6 @@ public final class CrawlStacker {
profileHandle,
0,
0,
0,
0
));
}
@ -270,9 +266,7 @@ public final class CrawlStacker {
profileHandle,
0,
0,
0,
entry.size
));
0));
}
} catch (final IOException e1) {
ConcurrentLog.logException(e1);
@ -298,9 +292,7 @@ public final class CrawlStacker {
pe.handle(),
0,
0,
0,
0
));
0));
}
/**
@ -344,19 +336,10 @@ public final class CrawlStacker {
return error;
}
long maxFileSize = Long.MAX_VALUE;
if (!entry.isEmpty()) {
final String protocol = entry.url().getProtocol();
if (protocol.equals("http") || protocol.equals("https")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
if (protocol.equals("ftp")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.ftp.maxFileSize", FTPLoader.DEFAULT_MAXFILESIZE);
if (protocol.equals("smb")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.smb.maxFileSize", SMBLoader.DEFAULT_MAXFILESIZE);
}
// check availability of parser and maxfilesize
String warning = null;
ContentDomain contentDomain = entry.url().getContentDomainFromExt();
if ((maxFileSize >= 0 && entry.size() > maxFileSize) ||
contentDomain == ContentDomain.APP ||
if (contentDomain == ContentDomain.APP ||
(contentDomain == ContentDomain.IMAGE && TextParser.supportsExtension(entry.url()) != null) ||
contentDomain == ContentDomain.AUDIO ||
contentDomain == ContentDomain.VIDEO ||

@ -524,8 +524,7 @@ public class CrawlQueues {
this.sb.crawler.defaultRemoteProfile.handle(),
0,
0,
0,
item.getSize()
0
));
} else {
CrawlQueues.log.warn("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);

@ -92,7 +92,6 @@ public class Request extends WorkflowJob
private int anchors; // number of anchors of the parent
private int forkfactor; // sum of anchors of all ancestors
private Bitfield flags;
private long size; // size of resource in bytes (if known) or 0 if not known
private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
@ -111,7 +110,6 @@ public class Request extends WorkflowJob
this.statusMessage = null;
this.initialHash = 0;
this.status = 0;
this.size = 0;
}
/**
@ -121,7 +119,7 @@ public class Request extends WorkflowJob
* @param referrerhash
*/
public Request(final DigestURL url, final byte[] referrerhash) {
this(null, url, referrerhash, null, null, null, 0, 0, 0, 0);
this(null, url, referrerhash, null, null, null, 0, 0, 0);
}
/**
@ -146,8 +144,7 @@ public class Request extends WorkflowJob
final String profileHandle,
final int depth,
final int anchors,
final int forkfactor,
final long size) {
final int forkfactor) {
// create new entry and store it into database
assert url != null;
assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle
@ -167,7 +164,6 @@ public class Request extends WorkflowJob
this.statusMessage = "loaded(args)";
this.initialHash = url.hashCode();
this.status = WorkflowJob.STATUS_INITIATED;
this.size = size;
}
public Request(final Row.Entry entry) throws IOException {
@ -195,7 +191,6 @@ public class Request extends WorkflowJob
this.flags = new Bitfield(entry.getColBytes(10, true));
//this.loaddate = entry.getColLong(12);
//this.lastmodified = entry.getColLong(13);
this.size = entry.getColLong(14);
this.statusMessage = "loaded(kelondroRow.Entry)";
this.initialHash = this.url.hashCode();
} catch (final Throwable e ) {
@ -224,7 +219,6 @@ public class Request extends WorkflowJob
final byte[] appdatestr = NaturalOrder.encodeLong(this.appdate, rowdef.width(5));
final byte[] loaddatestr = NaturalOrder.encodeLong(0 /*loaddate*/, rowdef.width(12));
final byte[] serverdatestr = NaturalOrder.encodeLong(0 /*lastmodified*/, rowdef.width(13));
final byte[] sizestr = NaturalOrder.encodeLong(this.size, rowdef.width(14));
// store the hash in the hash cache
final byte[] namebytes = UTF8.getBytes(this.name);
final byte[][] entry =
@ -243,7 +237,7 @@ public class Request extends WorkflowJob
NaturalOrder.encodeLong(0, rowdef.width(11)),
loaddatestr,
serverdatestr,
sizestr
new byte[0] // dummy, not used (any more)
};
return rowdef.newEntry(entry);
}
@ -278,26 +272,6 @@ public class Request extends WorkflowJob
return new Date(this.appdate);
}
/*
public Date loaddate() {
// the date when the url was loaded
return new Date(this.loaddate);
}
public Date lastmodified() {
// the date that the server returned as document date
return new Date(this.lastmodified);
}
*/
public long size() {
// the date that the client (browser) send as ifModifiedSince in proxy mode
return this.size;
}
public boolean isEmpty() {
return this.size == 0;
}
public String name() {
// return the anchor name (text inside <a> tag)
return this.name;

@ -159,6 +159,9 @@ public class Response {
this.status = QUEUE_STATE_FRESH;
this.content = content;
this.fromCache = fromCache;
if (this.responseHeader != null && content != null && Integer.parseInt(this.responseHeader.get(HeaderFramework.CONTENT_LENGTH, "0")) <= content.length) {
this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Integer.toString(content.length)); // repair length
}
}
/**
@ -173,11 +176,11 @@ public class Response {
this.requestHeader = new RequestHeader();
this.responseHeader = new ResponseHeader(200);
this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(MultiProtocolURL.getFileExtension(request.url().getFileName()), "text/plain")); // tell parser how to handle the content
if (!request.isEmpty()) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
this.profile = profile;
this.status = QUEUE_STATE_FRESH;
this.content = request.name().length() > 0 ? UTF8.getBytes(request.name()) : UTF8.getBytes(request.url().toTokens());
this.fromCache = true;
if (this.responseHeader != null) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, "0"); // 'virtual' length, shows that the resource was not loaded
}
public Response(
@ -262,6 +265,9 @@ public class Response {
/**
 * Replaces the content of this response and, if necessary, repairs the
 * Content-Length entry of the response header.
 *
 * The header entry is overwritten with the actual number of content bytes
 * when the declared length is not larger than the actual length, or when
 * the declared value cannot be parsed as a number at all. A declared length
 * that is larger than the actual content is left untouched.
 *
 * @param data the new content; may be null, in which case the header is not changed
 */
public void setContent(final byte[] data) {
    this.content = data;
    if (this.responseHeader == null || data == null) {
        return;
    }
    final String declared = this.responseHeader.get(HeaderFramework.CONTENT_LENGTH, "0");
    long declaredLength;
    try {
        // parse as long: a declared length above Integer.MAX_VALUE is a valid
        // header value and must not make this setter throw
        declaredLength = declared == null ? 0L : Long.parseLong(declared.trim());
    } catch (final NumberFormatException e) {
        // an unparseable declaration is a bad value: force the repair below
        declaredLength = -1L;
    }
    if (declaredLength <= data.length) {
        this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Integer.toString(data.length)); // repair length
    }
}
public byte[] getContent() {

@ -104,7 +104,6 @@ public class SitemapImporter extends Thread {
this.crawlingProfile.handle(),
0,
0,
0,
0
));
logger.info("New URL '" + entry.url() + "' added for loading.");

@ -1,5 +1,5 @@
// YMarkCrawlStart.java
// (C) 2012 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
// (C) 2012 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
// first published 2011 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
@ -195,7 +195,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
null,
"CRAWLING-ROOT",
new Date(),
pe.handle(), 0, 0, 0, 0
pe.handle(), 0, 0, 0
));
}
}

@ -76,7 +76,6 @@ public class ProxyCacheHandler extends AbstractRemoteHandler implements Handler
sb.crawler.defaultProxyProfile.handle(),
0,
0,
0,
0);
final Response cachedResponse = new Response(

@ -189,7 +189,6 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
sb.crawler.defaultProxyProfile.handle(),
0,
0,
0,
0); //sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
final Response yacyResponse = new Response(
yacyRequest,
@ -251,7 +250,7 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
* adds specific header elements for the connection of the internal
* httpclient to the remote server according to local config
*
* @param header header für http client (already preset with headers from
* @param header header für http client (already preset with headers from
* original ServletRequest
* @param origServletRequest original request/header
*/

@ -128,7 +128,6 @@ public final class LoaderDispatcher {
this.sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
0,
0,
0,
0);
}

@ -199,7 +199,6 @@ import net.yacy.search.ranking.RankingProfile;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphConfiguration;
import net.yacy.search.schema.WebgraphSchema;
import net.yacy.server.serverCore;
import net.yacy.server.serverSwitch;
import net.yacy.server.http.RobotsTxtConfig;
@ -1926,7 +1925,6 @@ public final class Switchboard extends serverSwitch {
this.crawler.defaultSurrogateProfile.handle(),
0,
0,
0,
0);
response = new Response(request, null, null, this.crawler.defaultSurrogateProfile, false);
final IndexingQueueEntry queueEntry =
@ -2634,8 +2632,7 @@ public final class Switchboard extends serverSwitch {
response.profile().handle(),
response.depth() + 1,
0,
0,
response.size() < 0 ? 0 : response.size()));
0));
} catch (final MalformedURLException e ) {
ConcurrentLog.logException(e);
}
@ -3013,7 +3010,6 @@ public final class Switchboard extends serverSwitch {
profile.handle(),
0,
0,
0,
0
));

@ -380,7 +380,6 @@ public final class HTTPDProxyHandler {
sb.crawler.defaultProxyProfile.handle(),
0,
0,
0,
0);
final Response response = new Response(
request,
@ -509,8 +508,7 @@ public final class HTTPDProxyHandler {
sb.crawler.defaultProxyProfile.handle(),
0,
0,
0,
sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
0);
// handle incoming cookies

Loading…
Cancel
Save