- code cleanup; added a debug line for further investigation in HTTPDemon.parseMultipart

- changed the data structure used for sorting in search to one that performs better in that specific case (too many updates)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7150 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent ac1c08924e
commit 5870b13f3a

@ -57,15 +57,6 @@ public final class HTTPLoader {
* The socket timeout that should be used
*/
private final int socketTimeout;
/**
* The maximum allowed file size
*/
//private long maxFileSize = -1;
//private String acceptEncoding;
//private String acceptLanguage;
//private String acceptCharset;
private final Switchboard sb;
private final Log log;
@ -119,27 +110,20 @@ public final class HTTPLoader {
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
// HTTP-Client
// final Client client = new Client(socketTimeout, requestHeader);
// ResponseContainer res = null;
final HTTPClient client = new HTTPClient();
client.setTimout(socketTimeout);
client.setHeader(requestHeader.entrySet());
// try {
// send request
// res = client.GET(request.url().toString(), maxFileSize);
final byte[] responseBody = client.GETbytes(request.url().toString(), maxFileSize);
final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
// if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
if (responseBody != null && (code == 200 || code == 203)) {
// the transfer is ok
// we write the new cache entry to file system directly
// res.setAccountingName("CRAWLER");
// final byte[] responseBody = res.getData();
long contentLength = responseBody.length;
ByteCount.addAccountCount(ByteCount.CRAWLER, contentLength);
@ -154,8 +138,6 @@ public final class HTTPLoader {
response = new Response(
request,
requestHeader,
// res.getResponseHeader(),
// res.getStatusLine(),
header,
Integer.toString(code),
mp == null ? null : new CrawlProfile(mp),
@ -163,12 +145,9 @@ public final class HTTPLoader {
);
return response;
// } else if (res.getStatusLine().startsWith("30")) {
// if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) {
} else if (code > 299 && code < 310) {
if (header.containsKey(HeaderFramework.LOCATION)) {
// getting redirection URL
// String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
String redirectionUrlString = header.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
@ -181,7 +160,6 @@ public final class HTTPLoader {
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
// restart crawling with new url
// this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + request.url().toString());
this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + request.url().toString());
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
@ -204,17 +182,9 @@ public final class HTTPLoader {
}
} else {
// if the response has not the right response type then reject file
// sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
// throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString());
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code + ")");
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
// } finally {
// if(res != null) {
// // release connection
// res.closeStream();
// }
// }
return response;
}
@ -251,22 +221,15 @@ public final class HTTPLoader {
requestHeader.put(HeaderFramework.ACCEPT_CHARSET, DEFAULT_CHARSET);
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, DEFAULT_ENCODING);
// HTTP-Client
// final Client client = new Client(20000, requestHeader);
// ResponseContainer res = null;
final HTTPClient client = new HTTPClient();
client.setTimout(20000);
client.setHeader(requestHeader.entrySet());
// try {
// send request
// res = client.GET(request.url().toString(), Long.MAX_VALUE);
final byte[] responseBody = client.GETbytes(request.url().toString(), Long.MAX_VALUE);
final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
// if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
if (responseBody != null && (code == 200 || code == 203)) {
// the transfer is ok
@ -274,15 +237,11 @@ public final class HTTPLoader {
ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length);
// we write the new cache entry to file system directly
// res.setAccountingName("CRAWLER");
// final byte[] responseBody = res.getData();
// create a new cache entry
response = new Response(
request,
requestHeader,
// res.getResponseHeader(),
// res.getStatusLine(),
header,
Integer.toString(code),
null,
@ -290,12 +249,9 @@ public final class HTTPLoader {
);
return response;
// } else if (res.getStatusLine().startsWith("30")) {
// if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) {
} else if (code > 299 && code < 310) {
if (header.containsKey(HeaderFramework.LOCATION)) {
// getting redirection URL
// String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
String redirectionUrlString = header.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
@ -318,15 +274,8 @@ public final class HTTPLoader {
}
} else {
// if the response has not the right response type then reject file
// throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString());
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
// } finally {
// if(res != null) {
// // release connection
// res.closeStream();
// }
// }
return response;
}

@ -1032,22 +1032,7 @@ public final class HTTPDFileHandler {
// flush all
try {newOut.flush();}catch (final Exception e) {}
/*
// wait a little time until everything closes so that clients can read from the streams/sockets
if ((contentLength >= 0) && (requestHeader.get(RequestHeader.CONNECTION, "close")).indexOf("keep-alive") == -1) {
// in case that the client knows the size in advance (contentLength present) the waiting will have no effect on the interface performance
// but if the client waits on a connection interruption this will slow down.
try {Thread.sleep(2000);} catch (final InterruptedException e) {} // FIXME: is this necessary?
}
*/
}
// check mime type again using the result array: these are 'magics'
// if (serverByteBuffer.equals(result, 1, "PNG".getBytes())) mimeType = mimeTable.getProperty("png","text/html");
// else if (serverByteBuffer.equals(result, 0, "GIF89".getBytes())) mimeType = mimeTable.getProperty("gif","text/html");
// else if (serverByteBuffer.equals(result, 6, "JFIF".getBytes())) mimeType = mimeTable.getProperty("jpg","text/html");
//System.out.print("MAGIC:"); for (int i = 0; i < 10; i++) System.out.print(Integer.toHexString((int) result[i]) + ","); System.out.println();
}
} else {
HTTPDemon.sendRespondError(conProp,out,3,404,"File not Found",null,null);
@ -1055,8 +1040,7 @@ public final class HTTPDFileHandler {
}
} catch (final Exception e) {
try {
// doing some errorhandling ...
//Log.logException(e);
// error handling
int httpStatusCode = 400;
final String httpStatusText = null;
final StringBuilder errorMessage = new StringBuilder(2000);

@ -484,21 +484,16 @@ public final class HTTPDProxyHandler {
// send request
try {
// res = client.GET(getUrl);
// if (log.isFinest()) log.logFinest(reqID +" response status: "+ res.getStatusLine());
client.GET(getUrl);
if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine());
conProp.put(HeaderFramework.CONNECTION_PROP_CLIENT_REQUEST_HEADER, requestHeader);
// final ResponseHeader responseHeader = res.getResponseHeader();
final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders());
// determine if it's an internal error of the httpc
if (responseHeader.isEmpty()) {
// throw new Exception(res.getStatusLine());
throw new Exception(client.getHttpResponse().getStatusLine().toString());
}
// final ChunkedOutputStream chunkedOut = setTransferEncoding(conProp, responseHeader, res.getStatusCode(), respond);
final ChunkedOutputStream chunkedOut = setTransferEncoding(conProp, responseHeader, client.getHttpResponse().getStatusLine().getStatusCode(), respond);
// the cache does either not exist or is (supposed to be) stale
@ -539,13 +534,6 @@ public final class HTTPDProxyHandler {
}
if (log.isFinest()) log.logFinest(reqID +" sending response header: "+ responseHeader);
// HTTPDemon.sendRespondHeader(
// conProp,
// respond,
// httpVer,
// res.getStatusCode(),
// res.getStatusLine().substring(4), // status text
// responseHeader);
HTTPDemon.sendRespondHeader(
conProp,
respond,
@ -554,7 +542,6 @@ public final class HTTPDProxyHandler {
client.getHttpResponse().getStatusLine().toString(), // status text
responseHeader);
// if (hasBody(res.getStatusCode())) {
if (hasBody(client.getHttpResponse().getStatusLine().getStatusCode())) {
final OutputStream outStream = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
@ -562,7 +549,6 @@ public final class HTTPDProxyHandler {
request,
requestHeader,
responseHeader,
// res.getStatusLine(),
Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()),
sb.crawler.defaultProxyProfile
);
@ -940,47 +926,19 @@ public final class HTTPDProxyHandler {
if(body == null) {
log.logSevere("no body to POST!");
}
// from old httpc:
// "if there is a body to the call, we would have a CONTENT-LENGTH tag in the requestHeader"
// it seems that it is a HTTP/1.1 connection which stays open (the inputStream) and endlessly waits for
// input so we have to end it to do the request
// this should not be needed anymore - see org.apache.http.entity.InputStreamEntity
// final int contentLength = requestHeader.getContentLength();
// if (contentLength > -1) {
// final byte[] bodyData;
// if(contentLength == 0) {
// // no body
// bodyData = new byte[0];
// } else {
// // read content-length bytes into memory
// bodyData = new byte[contentLength];
// int bytes_read = 0;
// while(bytes_read < contentLength) {
// bytes_read += body.read(bodyData, bytes_read, contentLength-bytes_read);
// }
// }
// body = new ByteArrayInputStream(bodyData);
// }
// ResponseContainer res = null;
try {
// sending the request
// res = client.POST(getUrl, body);
// if (log.isFinest()) log.logFinest(reqID +" response status: "+ res.getStatusLine());
client.POST(getUrl, body, contentLength);
if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine());
// final ResponseHeader responseHeader = res.getResponseHeader();
final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders());
// determine if it's an internal error of the httpc
if (responseHeader.isEmpty()) {
// throw new Exception(res.getStatusLine());
throw new Exception(client.getHttpResponse().getStatusLine().toString());
}
// final ChunkedOutputStream chunked = setTransferEncoding(conProp, responseHeader, res.getStatusCode(), countedRespond);
final ChunkedOutputStream chunked = setTransferEncoding(conProp, responseHeader, client.getHttpResponse().getStatusLine().getStatusCode(), countedRespond);
// prepareResponseHeader(responseHeader, res.getHttpVer());
prepareResponseHeader(responseHeader, client.getHttpResponse().getProtocolVersion().toString());
// sending the respond header back to the client
@ -990,12 +948,6 @@ public final class HTTPDProxyHandler {
// sending response headers
if (log.isFinest()) log.logFinest(reqID +" sending response header: "+ responseHeader);
// HTTPDemon.sendRespondHeader(conProp,
// countedRespond,
// httpVer,
// res.getStatusCode(),
// res.getStatusLine().substring(4), // status text
// responseHeader);
HTTPDemon.sendRespondHeader(conProp,
countedRespond,
httpVer,
@ -1003,19 +955,7 @@ public final class HTTPDProxyHandler {
client.getHttpResponse().getStatusLine().toString(), // status text
responseHeader);
// respondHeader(respond, res.status, res.responseHeader);
// Saver.writeContent(res, (chunked != null) ? new BufferedOutputStream(chunked) : new BufferedOutputStream(respond));
/*
// *** (Uebernommen aus Saver-Klasse: warum ist dies hier die einzige Methode, die einen OutputStream statt einen Writer benutzt?)
try {
serverFileUtils.copyToStream(new BufferedInputStream(res.getDataAsStream()), (chunked != null) ? new BufferedOutputStream(chunked) : new BufferedOutputStream(respond));
} finally {
res.closeStream();
}
if (chunked != null) chunked.finish();
*/
final OutputStream outStream = (chunked != null) ? chunked : countedRespond;
// FileUtils.copy(res.getDataAsStream(), outStream);
client.writeTo(outStream);
if (chunked != null) {
@ -1024,14 +964,8 @@ public final class HTTPDProxyHandler {
outStream.flush();
} catch(SocketException se) {
// connection closed by client, abort download
// res.abort();
client.finish();
} finally {
// if opened ...
// if(res != null) {
// // ... close connection
// res.closeStream();
// }
client.finish();
}
} catch (final Exception e) {
@ -1118,8 +1052,6 @@ public final class HTTPDProxyHandler {
*/
private static HTTPClient setupHttpClient(final RequestHeader requestHeader, final String connectHost) {
// setup HTTP-client
// final Client client = new Client(timeout, requestHeader);
// client.setFollowRedirects(false);
final HTTPClient client = new HTTPClient();
client.setTimout(timeout);
client.setHeader(requestHeader.entrySet());
@ -1293,20 +1225,13 @@ public final class HTTPDProxyHandler {
// possibly branch into PROXY-PROXY connection
if (ProxySettings.use && ProxySettings.use4ssl) {
// final Client remoteProxy = new Client(timeout, requestHeader);
// remoteProxy.setFollowRedirects(false); // should not be needed, but safe is safe
final HTTPClient remoteProxy = setupHttpClient(requestHeader, host);
// ResponseContainer response = null;
try {
// response = remoteProxy.CONNECT(host, port);
remoteProxy.HEADResponse("http://" + host + ":" + port);
ResponseHeader header = new ResponseHeader(remoteProxy.getHttpResponse().getAllHeaders());
// outputs a logline to the serverlog with the current status
// log.logInfo("CONNECT-RESPONSE: status=" + response.getStatusLine() + ", header=" + response.getResponseHeader().toString());
// // (response.getStatusLine().charAt(0) == '2') || (response.getStatusLine().charAt(0) == '3')
// final boolean success = response.getStatusCode() >= 200 && response.getStatusCode() <= 399;
log.logInfo("CONNECT-RESPONSE: status=" + remoteProxy.getHttpResponse().getStatusLine() + ", header=" + header.toString());
final boolean success = remoteProxy.getHttpResponse().getStatusLine().getStatusCode() >= 200 && remoteProxy.getHttpResponse().getStatusLine().getStatusCode() <= 399;
if (success) {
@ -1316,7 +1241,6 @@ public final class HTTPDProxyHandler {
// go on (see below)
} else {
// pass error response back to client
// HTTPDemon.sendRespondHeader(conProp,clientOut,httpVersion,response.getStatusCode(),response.getStatusLine().substring(4),response.getResponseHeader());
HTTPDemon.sendRespondHeader(
conProp,
clientOut,
@ -1328,16 +1252,8 @@ public final class HTTPDProxyHandler {
forceConnectionClose(conProp);
return;
}
// } catch (SocketException se) {
// // connection closed by client, abort download
// response.abort();
} catch (final Exception e) {
throw new IOException(e.getMessage());
// } finally {
// if(response != null) {
// // release connection
// response.closeStream();
// }
}
}

@ -802,8 +802,8 @@ public final class HTTPDemon implements serverHandler, Cloneable {
* @throws IOException
*/
@SuppressWarnings("unchecked")
public static Map<String, byte[]> parseMultipart(final RequestHeader header, final serverObjects args, final InputStream in)
throws IOException {
public static Map<String, byte[]> parseMultipart(final RequestHeader header, final serverObjects args, final InputStream in) throws IOException {
//ByteArrayInputStream in = new ByteArrayInputStream(FileUtils.read(inx));
final InputStream body = prepareBody(header, in);
RequestContext request = new yacyContextRequest(header, body);
@ -821,13 +821,15 @@ public final class HTTPDemon implements serverHandler, Cloneable {
// parse data in memory
FileUpload upload = new FileUpload(diskFileItemFactory);
List<FileItem> items;
long time = System.currentTimeMillis();
try {
items = upload.parseRequest(request);
} catch (FileUploadException e) {
//Log.logException(e);
throw new IOException("FileUploadException " + e.getMessage());
}
System.out.println("**** FileUploadBase.parseRequest time = " + (System.currentTimeMillis() - time));
// format information for further usage
final HashMap<String, byte[]> files = new HashMap<String, byte[]>();
for (FileItem item : items) {

@ -35,6 +35,7 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
@ -50,8 +51,6 @@ import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.BinSearch;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.rwi.ReferenceContainer;
@ -69,9 +68,9 @@ public final class RankingProcess extends Thread {
private static final int maxDoubleDomAll = 100, maxDoubleDomSpecial = 10000;
private final QueryParams query;
private final HandleSet urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private final TreeSet<byte[]> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private final int[] flagcount; // flag counter
private final HandleSet misses; // contains url-hashes that could not been found in the LURL-DB
private final TreeSet<byte[]> misses; // contains url-hashes that could not been found in the LURL-DB
//private final int[] domZones;
private TreeMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
@ -102,8 +101,10 @@ public final class RankingProcess extends Thread {
this.remote_indexCount = 0;
this.local_resourceSize = 0;
this.local_indexCount = 0;
this.urlhashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.misses = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.urlhashes = new TreeSet<byte[]>(URIMetadataRow.rowdef.objectOrder);
//this.urlhashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.misses = new TreeSet<byte[]>(URIMetadataRow.rowdef.objectOrder);
//this.misses = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.flagcount = new int[32];
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
this.hostNavigator = new Navigator();
@ -221,13 +222,8 @@ public final class RankingProcess extends Thread {
this.hostNavigator.inc(domhash, uhb);
}
// accept; insert to ranked stack with double-check
try {
if (!urlhashes.put(iEntry.metadataHash())) {
stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
}
} catch (RowSpaceExceededException e) {
Log.logException(e);
if (urlhashes.add(iEntry.metadataHash())) {
stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
}
// increase counter for statistics
@ -364,11 +360,7 @@ public final class RankingProcess extends Thread {
urlhash = obrwi.getElement().metadataHash();
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(urlhash, obrwi.getElement(), obrwi.getWeight());
if (page == null) {
try {
misses.put(obrwi.getElement().metadataHash());
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
misses.add(obrwi.getElement().metadataHash());
continue;
}

@ -155,8 +155,7 @@ public final class SearchEvent {
} else {
// do a local search
this.rankedCache = new RankingProcess(this.query, this.order, max_results_preparation, 2);
this.rankedCache.run();
//CrawlSwitchboard.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process);
this.rankedCache.run(); // this is not started concurrently here on purpose!
if (generateAbstracts) {
// compute index abstracts

@ -1133,6 +1133,7 @@ public final class yacyClient {
}
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println("Search Time: " + (System.currentTimeMillis() - time));

@ -422,8 +422,12 @@ public class Domains {
if ((host == null) || (host.length() == 0)) return null;
host = host.toLowerCase().trim();
// try to simply parse the address
InetAddress ip = parseInetAddress(host);
if (ip != null) return ip;
// trying to resolve host by doing a name cache lookup
final InetAddress ip = nameCacheHit.get(host);
ip = nameCacheHit.get(host);
if (ip != null) return ip;
if (nameCacheMiss.containsKey(host)) return null;

Loading…
Cancel
Save