diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index d2d8f326b..a42c0aedd 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -57,15 +57,6 @@ public final class HTTPLoader { * The socket timeout that should be used */ private final int socketTimeout; - - /** - * The maximum allowed file size - */ - //private long maxFileSize = -1; - - //private String acceptEncoding; - //private String acceptLanguage; - //private String acceptCharset; private final Switchboard sb; private final Log log; @@ -119,27 +110,20 @@ public final class HTTPLoader { requestHeader.put(HeaderFramework.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); // HTTP-Client -// final Client client = new Client(socketTimeout, requestHeader); -// ResponseContainer res = null; final HTTPClient client = new HTTPClient(); client.setTimout(socketTimeout); client.setHeader(requestHeader.entrySet()); -// try { // send request -// res = client.GET(request.url().toString(), maxFileSize); final byte[] responseBody = client.GETbytes(request.url().toString(), maxFileSize); final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); final int code = client.getHttpResponse().getStatusLine().getStatusCode(); // FIXME: 30*-handling (bottom) is never reached // we always get the final content because httpClient.followRedirects = true -// if (res.getStatusCode() == 200 || res.getStatusCode() == 203) { if (responseBody != null && (code == 200 || code == 203)) { // the transfer is ok // we write the new cache entry to file system directly -// res.setAccountingName("CRAWLER"); -// final byte[] responseBody = res.getData(); long contentLength = responseBody.length; ByteCount.addAccountCount(ByteCount.CRAWLER, contentLength); @@ -154,8 +138,6 @@ public final class HTTPLoader { response = new Response( request, requestHeader, -// res.getResponseHeader(), -// res.getStatusLine(), header, Integer.toString(code), mp == null ? null : new CrawlProfile(mp), @@ -163,12 +145,9 @@ public final class HTTPLoader { ); return response; -// } else if (res.getStatusLine().startsWith("30")) { -// if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) { } else if (code > 299 && code < 310) { if (header.containsKey(HeaderFramework.LOCATION)) { // getting redirection URL -// String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION); String redirectionUrlString = header.get(HeaderFramework.LOCATION); redirectionUrlString = redirectionUrlString.trim(); @@ -181,7 +160,6 @@ public final class HTTPLoader { final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); // restart crawling with new url -// this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + request.url().toString()); this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + request.url().toString()); this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl); @@ -204,17 +182,9 @@ public final class HTTPLoader { } } else { // if the response has not the right response type then reject file -// sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + res.getStatusCode() + ")"); -// throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString()); sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code + ")"); throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } -// } finally { -// if(res != null) { -// // release connection -// res.closeStream(); -// } -// } return response; } @@ -251,22 +221,15 @@ public final class HTTPLoader { requestHeader.put(HeaderFramework.ACCEPT_CHARSET, DEFAULT_CHARSET); requestHeader.put(HeaderFramework.ACCEPT_ENCODING, DEFAULT_ENCODING); - // HTTP-Client -// final Client client = new Client(20000, requestHeader); -// ResponseContainer res = null; final HTTPClient client = new HTTPClient(); client.setTimout(20000); client.setHeader(requestHeader.entrySet()); -// try { - // send request -// res = client.GET(request.url().toString(), Long.MAX_VALUE); final byte[] responseBody = client.GETbytes(request.url().toString(), Long.MAX_VALUE); final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); final int code = client.getHttpResponse().getStatusLine().getStatusCode(); // FIXME: 30*-handling (bottom) is never reached // we always get the final content because httpClient.followRedirects = true -// if (res.getStatusCode() == 200 || res.getStatusCode() == 203) { if (responseBody != null && (code == 200 || code == 203)) { // the transfer is ok @@ -274,15 +237,11 @@ public final class HTTPLoader { ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length); // we write the new cache entry to file system directly -// res.setAccountingName("CRAWLER"); -// final byte[] responseBody = res.getData(); // create a new cache entry response = new Response( request, requestHeader, -// res.getResponseHeader(), -// res.getStatusLine(), header, Integer.toString(code), null, @@ -290,12 +249,9 @@ public final class HTTPLoader { ); return response; -// } else if (res.getStatusLine().startsWith("30")) { -// if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) { } else if (code > 299 && code < 310) { if (header.containsKey(HeaderFramework.LOCATION)) { // getting redirection URL -// String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION); String redirectionUrlString = header.get(HeaderFramework.LOCATION); redirectionUrlString = redirectionUrlString.trim(); @@ -318,15 +274,8 @@ public final class HTTPLoader { } } else { // if the response has not the right response type then reject file -// throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString()); throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } -// } finally { -// if(res != null) { -// // release connection -// res.closeStream(); -// } -// } return response; } diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java index 5a33a11e2..a21999ec3 100644 --- a/source/de/anomic/http/server/HTTPDFileHandler.java +++ b/source/de/anomic/http/server/HTTPDFileHandler.java @@ -1032,22 +1032,7 @@ public final class HTTPDFileHandler { // flush all try {newOut.flush();}catch (final Exception e) {} - - /* - // wait a little time until everything closes so that clients can read from the streams/sockets - if ((contentLength >= 0) && (requestHeader.get(RequestHeader.CONNECTION, "close")).indexOf("keep-alive") == -1) { - // in case that the client knows the size in advance (contentLength present) the waiting will have no effect on the interface performance - // but if the client waits on a connection interruption this will slow down. - try {Thread.sleep(2000);} catch (final InterruptedException e) {} // FIXME: is this necessary? - } - */ } - - // check mime type again using the result array: these are 'magics' -// if (serverByteBuffer.equals(result, 1, "PNG".getBytes())) mimeType = mimeTable.getProperty("png","text/html"); -// else if (serverByteBuffer.equals(result, 0, "GIF89".getBytes())) mimeType = mimeTable.getProperty("gif","text/html"); -// else if (serverByteBuffer.equals(result, 6, "JFIF".getBytes())) mimeType = mimeTable.getProperty("jpg","text/html"); - //System.out.print("MAGIC:"); for (int i = 0; i < 10; i++) System.out.print(Integer.toHexString((int) result[i]) + ","); System.out.println(); } } else { HTTPDemon.sendRespondError(conProp,out,3,404,"File not Found",null,null); @@ -1055,8 +1040,7 @@ public final class HTTPDFileHandler { } } catch (final Exception e) { try { - // doing some errorhandling ... - //Log.logException(e); + // error handling int httpStatusCode = 400; final String httpStatusText = null; final StringBuilder errorMessage = new StringBuilder(2000); diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java index 570c7e4b2..f5a0a2461 100644 --- a/source/de/anomic/http/server/HTTPDProxyHandler.java +++ b/source/de/anomic/http/server/HTTPDProxyHandler.java @@ -484,21 +484,16 @@ public final class HTTPDProxyHandler { // send request try { -// res = client.GET(getUrl); -// if (log.isFinest()) log.logFinest(reqID +" response status: "+ res.getStatusLine()); client.GET(getUrl); if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine()); conProp.put(HeaderFramework.CONNECTION_PROP_CLIENT_REQUEST_HEADER, requestHeader); -// final ResponseHeader responseHeader = res.getResponseHeader(); final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); // determine if it's an internal error of the httpc if (responseHeader.isEmpty()) { -// throw new Exception(res.getStatusLine()); throw new Exception(client.getHttpResponse().getStatusLine().toString()); } -// final ChunkedOutputStream chunkedOut = setTransferEncoding(conProp, responseHeader, res.getStatusCode(), respond); final ChunkedOutputStream chunkedOut = setTransferEncoding(conProp, responseHeader, client.getHttpResponse().getStatusLine().getStatusCode(), respond); // the cache does either not exist or is (supposed to be) stale @@ -539,13 +534,6 @@ public final class HTTPDProxyHandler { } if (log.isFinest()) log.logFinest(reqID +" sending response header: "+ responseHeader); -// HTTPDemon.sendRespondHeader( -// conProp, -// respond, -// httpVer, -// res.getStatusCode(), -// res.getStatusLine().substring(4), // status text -// responseHeader); HTTPDemon.sendRespondHeader( conProp, respond, @@ -554,7 +542,6 @@ public final class HTTPDProxyHandler { client.getHttpResponse().getStatusLine().toString(), // status text responseHeader); -// if (hasBody(res.getStatusCode())) { if (hasBody(client.getHttpResponse().getStatusLine().getStatusCode())) { final OutputStream outStream = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond); @@ -562,7 +549,6 @@ public final class HTTPDProxyHandler { request, requestHeader, responseHeader, -// res.getStatusLine(), Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()), sb.crawler.defaultProxyProfile ); @@ -940,47 +926,19 @@ public final class HTTPDProxyHandler { if(body == null) { log.logSevere("no body to POST!"); } - // from old httpc: - // "if there is a body to the call, we would have a CONTENT-LENGTH tag in the requestHeader" - // it seems that it is a HTTP/1.1 connection which stays open (the inputStream) and endlessly waits for - // input so we have to end it to do the request - // this should not be needed anymore - see org.apache.http.entity.InputStreamEntity -// final int contentLength = requestHeader.getContentLength(); -// if (contentLength > -1) { -// final byte[] bodyData; -// if(contentLength == 0) { -// // no body -// bodyData = new byte[0]; -// } else { -// // read content-length bytes into memory -// bodyData = new byte[contentLength]; -// int bytes_read = 0; -// while(bytes_read < contentLength) { -// bytes_read += body.read(bodyData, bytes_read, contentLength-bytes_read); -// } -// } -// body = new ByteArrayInputStream(bodyData); -// } -// ResponseContainer res = null; try { // sending the request -// res = client.POST(getUrl, body); -// if (log.isFinest()) log.logFinest(reqID +" response status: "+ res.getStatusLine()); client.POST(getUrl, body, contentLength); if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine()); -// final ResponseHeader responseHeader = res.getResponseHeader(); final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); // determine if it's an internal error of the httpc if (responseHeader.isEmpty()) { -// throw new Exception(res.getStatusLine()); throw new Exception(client.getHttpResponse().getStatusLine().toString()); } -// final ChunkedOutputStream chunked = setTransferEncoding(conProp, responseHeader, res.getStatusCode(), countedRespond); final ChunkedOutputStream chunked = setTransferEncoding(conProp, responseHeader, client.getHttpResponse().getStatusLine().getStatusCode(), countedRespond); -// prepareResponseHeader(responseHeader, res.getHttpVer()); prepareResponseHeader(responseHeader, client.getHttpResponse().getProtocolVersion().toString()); // sending the respond header back to the client @@ -990,12 +948,6 @@ public final class HTTPDProxyHandler { // sending response headers if (log.isFinest()) log.logFinest(reqID +" sending response header: "+ responseHeader); -// HTTPDemon.sendRespondHeader(conProp, -// countedRespond, -// httpVer, -// res.getStatusCode(), -// res.getStatusLine().substring(4), // status text -// responseHeader); HTTPDemon.sendRespondHeader(conProp, countedRespond, httpVer, @@ -1003,19 +955,7 @@ public final class HTTPDProxyHandler { client.getHttpResponse().getStatusLine().toString(), // status text responseHeader); - // respondHeader(respond, res.status, res.responseHeader); - // Saver.writeContent(res, (chunked != null) ? new BufferedOutputStream(chunked) : new BufferedOutputStream(respond)); - /* - // *** (Uebernommen aus Saver-Klasse: warum ist dies hier die einzige Methode, die einen OutputStream statt einen Writer benutzt?) - try { - serverFileUtils.copyToStream(new BufferedInputStream(res.getDataAsStream()), (chunked != null) ? new BufferedOutputStream(chunked) : new BufferedOutputStream(respond)); - } finally { - res.closeStream(); - } - if (chunked != null) chunked.finish(); - */ final OutputStream outStream = (chunked != null) ? chunked : countedRespond; -// FileUtils.copy(res.getDataAsStream(), outStream); client.writeTo(outStream); if (chunked != null) { @@ -1024,14 +964,8 @@ public final class HTTPDProxyHandler { outStream.flush(); } catch(SocketException se) { // connection closed by client, abort download -// res.abort(); client.finish(); } finally { - // if opened ... -// if(res != null) { -// // ... close connection -// res.closeStream(); -// } client.finish(); } } catch (final Exception e) { @@ -1118,8 +1052,6 @@ public final class HTTPDProxyHandler { */ private static HTTPClient setupHttpClient(final RequestHeader requestHeader, final String connectHost) { // setup HTTP-client -// final Client client = new Client(timeout, requestHeader); -// client.setFollowRedirects(false); final HTTPClient client = new HTTPClient(); client.setTimout(timeout); client.setHeader(requestHeader.entrySet()); @@ -1293,20 +1225,13 @@ public final class HTTPDProxyHandler { // possibly branch into PROXY-PROXY connection if (ProxySettings.use && ProxySettings.use4ssl) { -// final Client remoteProxy = new Client(timeout, requestHeader); -// remoteProxy.setFollowRedirects(false); // should not be needed, but safe is safe final HTTPClient remoteProxy = setupHttpClient(requestHeader, host); -// ResponseContainer response = null; try { -// response = remoteProxy.CONNECT(host, port); remoteProxy.HEADResponse("http://" + host + ":" + port); ResponseHeader header = new ResponseHeader(remoteProxy.getHttpResponse().getAllHeaders()); // outputs a logline to the serverlog with the current status -// log.logInfo("CONNECT-RESPONSE: status=" + response.getStatusLine() + ", header=" + response.getResponseHeader().toString()); -// // (response.getStatusLine().charAt(0) == '2') || (response.getStatusLine().charAt(0) == '3') -// final boolean success = response.getStatusCode() >= 200 && response.getStatusCode() <= 399; log.logInfo("CONNECT-RESPONSE: status=" + remoteProxy.getHttpResponse().getStatusLine() + ", header=" + header.toString()); final boolean success = remoteProxy.getHttpResponse().getStatusLine().getStatusCode() >= 200 && remoteProxy.getHttpResponse().getStatusLine().getStatusCode() <= 399; if (success) { @@ -1316,7 +1241,6 @@ public final class HTTPDProxyHandler { // go on (see below) } else { // pass error response back to client -// HTTPDemon.sendRespondHeader(conProp,clientOut,httpVersion,response.getStatusCode(),response.getStatusLine().substring(4),response.getResponseHeader()); HTTPDemon.sendRespondHeader( conProp, clientOut, @@ -1328,16 +1252,8 @@ public final class HTTPDProxyHandler { forceConnectionClose(conProp); return; } -// } catch (SocketException se) { -// // connection closed by client, abort download -// response.abort(); } catch (final Exception e) { throw new IOException(e.getMessage()); -// } finally { -// if(response != null) { -// // release connection -// response.closeStream(); -// } } } diff --git a/source/de/anomic/http/server/HTTPDemon.java b/source/de/anomic/http/server/HTTPDemon.java index a5958fa12..751e34271 100644 --- a/source/de/anomic/http/server/HTTPDemon.java +++ b/source/de/anomic/http/server/HTTPDemon.java @@ -802,8 +802,8 @@ public final class HTTPDemon implements serverHandler, Cloneable { * @throws IOException */ @SuppressWarnings("unchecked") - public static Map parseMultipart(final RequestHeader header, final serverObjects args, final InputStream in) - throws IOException { + public static Map parseMultipart(final RequestHeader header, final serverObjects args, final InputStream in) throws IOException { + //ByteArrayInputStream in = new ByteArrayInputStream(FileUtils.read(inx)); final InputStream body = prepareBody(header, in); RequestContext request = new yacyContextRequest(header, body); @@ -821,13 +821,15 @@ public final class HTTPDemon implements serverHandler, Cloneable { // parse data in memory FileUpload upload = new FileUpload(diskFileItemFactory); List items; + long time = System.currentTimeMillis(); try { items = upload.parseRequest(request); } catch (FileUploadException e) { //Log.logException(e); throw new IOException("FileUploadException " + e.getMessage()); } - + System.out.println("**** FileUploadBase.parseRequest time = " + (System.currentTimeMillis() - time)); + // format information for further usage final HashMap files = new HashMap(); for (FileItem item : items) { diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index c82241f64..17b592109 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -35,6 +35,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.TreeMap; +import java.util.TreeSet; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; @@ -50,8 +51,6 @@ import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.index.BinSearch; -import net.yacy.kelondro.index.HandleSet; -import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Digest; import net.yacy.kelondro.rwi.ReferenceContainer; @@ -69,9 +68,9 @@ public final class RankingProcess extends Thread { private static final int maxDoubleDomAll = 100, maxDoubleDomSpecial = 10000; private final QueryParams query; - private final HandleSet urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) + private final TreeSet urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) private final int[] flagcount; // flag counter - private final HandleSet misses; // contains url-hashes that could not been found in the LURL-DB + private final TreeSet misses; // contains url-hashes that could not been found in the LURL-DB //private final int[] domZones; private TreeMap> localSearchInclusion; @@ -102,8 +101,10 @@ public final class RankingProcess extends Thread { this.remote_indexCount = 0; this.local_resourceSize = 0; this.local_indexCount = 0; - this.urlhashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); - this.misses = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + this.urlhashes = new TreeSet(URIMetadataRow.rowdef.objectOrder); + //this.urlhashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + this.misses = new TreeSet(URIMetadataRow.rowdef.objectOrder); + //this.misses = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.flagcount = new int[32]; for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;} this.hostNavigator = new Navigator(); @@ -221,13 +222,8 @@ public final class RankingProcess extends Thread { this.hostNavigator.inc(domhash, uhb); } - // accept; insert to ranked stack with double-check - try { - if (!urlhashes.put(iEntry.metadataHash())) { - stack.put(new ReverseElement(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest) - } - } catch (RowSpaceExceededException e) { - Log.logException(e); + if (urlhashes.add(iEntry.metadataHash())) { + stack.put(new ReverseElement(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest) } // increase counter for statistics @@ -364,11 +360,7 @@ public final class RankingProcess extends Thread { urlhash = obrwi.getElement().metadataHash(); final URIMetadataRow page = this.query.getSegment().urlMetadata().load(urlhash, obrwi.getElement(), obrwi.getWeight()); if (page == null) { - try { - misses.put(obrwi.getElement().metadataHash()); - } catch (RowSpaceExceededException e) { - Log.logException(e); - } + misses.add(obrwi.getElement().metadataHash()); continue; } diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java index 3098e857a..91dcf95fa 100644 --- a/source/de/anomic/search/SearchEvent.java +++ b/source/de/anomic/search/SearchEvent.java @@ -155,8 +155,7 @@ public final class SearchEvent { } else { // do a local search this.rankedCache = new RankingProcess(this.query, this.order, max_results_preparation, 2); - this.rankedCache.run(); - //CrawlSwitchboard.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process); + this.rankedCache.run(); // this is not started concurrently here on purpose! if (generateAbstracts) { // compute index abstracts diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index a0aea9dca..82f14352e 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -1133,6 +1133,7 @@ public final class yacyClient { } } } catch (IOException e) { + // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("Search Time: " + (System.currentTimeMillis() - time)); diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index 9ed2611f9..3a7cd0841 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -422,8 +422,12 @@ public class Domains { if ((host == null) || (host.length() == 0)) return null; host = host.toLowerCase().trim(); + // try to simply parse the address + InetAddress ip = parseInetAddress(host); + if (ip != null) return ip; + // trying to resolve host by doing a name cache lookup - final InetAddress ip = nameCacheHit.get(host); + ip = nameCacheHit.get(host); if (ip != null) return ip; if (nameCacheMiss.containsKey(host)) return null;