From bd63999801cc60f01e18e88ddb09eeeadf14378f Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 7 Feb 2008 22:16:36 +0000 Subject: [PATCH] - faster search: using different data structures that avoid multiple calculations - no more table copy for error-eco table - optional table copy for lurl-entries - more abstractions (less single constant strings) - better logging (using host names instead of ips) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4459 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/BlogComments.java | 2 +- htroot/CrawlProfileEditor_p.java | 4 +- htroot/Settings_p.java | 2 +- htroot/TestApplet.java | 2 +- htroot/User.java | 6 +-- htroot/ViewImage.java | 2 +- htroot/Wiki.java | 2 +- htroot/www/welcome.java | 2 +- htroot/yacy/hello.java | 2 +- htroot/yacy/search.java | 2 +- htroot/yacy/transfer.java | 4 +- htroot/yacysearch.java | 2 +- source/de/anomic/data/userDB.java | 2 +- source/de/anomic/http/httpSSI.java | 2 +- source/de/anomic/http/httpd.java | 4 +- source/de/anomic/http/httpdFileHandler.java | 16 ++++---- source/de/anomic/index/indexRWIEntry.java | 2 - .../de/anomic/index/indexRWIEntryOrder.java | 23 +++++++++--- source/de/anomic/index/indexRWIRowEntry.java | 20 ++++------ source/de/anomic/index/indexRWIVarEntry.java | 37 ++++++++++++++----- source/de/anomic/index/indexURLEntry.java | 6 +-- .../anomic/kelondro/kelondroSplitTable.java | 2 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 4 +- source/de/anomic/plasma/plasmaCrawlZURL.java | 2 +- .../plasma/plasmaSearchRankingProcess.java | 34 +++++++++-------- .../de/anomic/plasma/plasmaSnippetCache.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 10 ++--- source/de/anomic/plasma/plasmaWordIndex.java | 4 +- 28 files changed, 113 insertions(+), 89 deletions(-) diff --git a/htroot/BlogComments.java b/htroot/BlogComments.java index 5da72e60b..63f2de9db 100644 --- a/htroot/BlogComments.java +++ b/htroot/BlogComments.java @@ -102,7 +102,7 @@ public class BlogComments { } String 
pagename = post.get("page", "blog_default"); - String ip = post.get("CLIENTIP", "127.0.0.1"); + String ip = post.get(httpHeader.CONNECTION_PROP_CLIENTIP, "127.0.0.1"); String StrAuthor = post.get("author", "anonymous"); diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index 06a1651ec..b0088f8fd 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -105,9 +105,9 @@ public class CrawlProfileEditor_p { while (it.hasNext()) { selentry = (entry)it.next(); if (selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) || - selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) || + selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) /*|| selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) || - selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA)) + selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA)*/) continue; prop.put("profiles_" + count + "_name", selentry.name()); prop.put("profiles_" + count + "_handle", selentry.handle()); diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java index bb215f721..086d51283 100644 --- a/htroot/Settings_p.java +++ b/htroot/Settings_p.java @@ -212,7 +212,7 @@ public final class Settings_p { } // clientIP - prop.putHTML("clientIP", (String) header.get("CLIENTIP", ""), true); // read an artificial header addendum + prop.putHTML("clientIP", (String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, ""), true); // read an artificial header addendum /* * seed upload settings diff --git a/htroot/TestApplet.java b/htroot/TestApplet.java index 439f25e13..1956462fa 100644 --- a/htroot/TestApplet.java +++ b/htroot/TestApplet.java @@ -50,7 +50,7 @@ public class TestApplet { //File templatefile=filehandler.getOverlayedFile((String)post.get("url")); File classfile = httpdFileHandler.getOverlayedClass((String)post.get("url")); httpHeader header2=new httpHeader(); - header2.put("CLIENTIP", "127.0.0.1"); + 
header2.put(httpHeader.CONNECTION_PROP_CLIENTIP, "127.0.0.1"); header2.put("PATH", post.get("url")); serverObjects tp=null; try { diff --git a/htroot/User.java b/htroot/User.java index e96b88dad..1e0f92ede 100644 --- a/htroot/User.java +++ b/htroot/User.java @@ -79,7 +79,7 @@ public class User{ prop.put("logged-in_identified-by", "2"); //try via ip if(entry == null){ - entry=sb.userDB.ipAuth(((String)header.get("CLIENTIP", "xxxxxx"))); + entry=sb.userDB.ipAuth(((String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "xxxxxx"))); if(entry != null){ prop.put("logged-in_identified-by", "0"); } @@ -108,7 +108,7 @@ public class User{ //identified via form-login //TODO: this does not work for a static admin, yet. }else if(post != null && post.containsKey("username") && post.containsKey("password")){ - //entry=sb.userDB.passwordAuth((String)post.get("username"), (String)post.get("password"), (String)header.get("CLIENTIP", "xxxxxx")); + //entry=sb.userDB.passwordAuth((String)post.get("username"), (String)post.get("password"), (String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "xxxxxx")); String username=(String)post.get("username"); String password=(String)post.get("password"); @@ -163,7 +163,7 @@ public class User{ if(post!=null && post.containsKey("logout")){ prop.put("logged-in", "0"); if(entry != null){ - entry.logout(((String)header.get("CLIENTIP", "xxxxxx")), userDB.getLoginToken(header.getHeaderCookies())); //todo: logout cookie + entry.logout(((String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "xxxxxx")), userDB.getLoginToken(header.getHeaderCookies())); //todo: logout cookie }else{ sb.userDB.adminLogout(userDB.getLoginToken(header.getHeaderCookies())); } diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index 3178d6979..785a05200 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -72,7 +72,7 @@ public class ViewImage { String urlString = post.get("url", ""); String urlLicense = post.get("code", ""); - boolean auth = ((String) 
header.get("CLIENTIP", "")).equals("localhost") || sb.verifyAuthentication(header, true); // handle access rights + boolean auth = ((String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")).equals("localhost") || sb.verifyAuthentication(header, true); // handle access rights yacyURL url = null; if ((urlString.length() > 0) && (auth)) try { diff --git a/htroot/Wiki.java b/htroot/Wiki.java index aed83eca7..26f4e33fc 100644 --- a/htroot/Wiki.java +++ b/htroot/Wiki.java @@ -88,7 +88,7 @@ public class Wiki { String access = switchboard.getConfig("WikiAccess", "admin"); String pagename = post.get("page", "start"); - String ip = post.get("CLIENTIP", "127.0.0.1"); + String ip = post.get(httpHeader.CONNECTION_PROP_CLIENTIP, "127.0.0.1"); String author = post.get("author", "anonymous"); if (author.equals("anonymous")) { author = wikiBoard.guessAuthor(ip); diff --git a/htroot/www/welcome.java b/htroot/www/welcome.java index 101070ac1..92d0aa5a1 100644 --- a/htroot/www/welcome.java +++ b/htroot/www/welcome.java @@ -78,7 +78,7 @@ public class welcome { prop.put("hostip", "Unknown Host Exception"); } prop.put("port", serverCore.getPortNr(env.getConfig("port","8080"))); - prop.put("clientip", (String) header.get("CLIENTIP", "")); + prop.put("clientip", (String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")); final String peertype = (yacyCore.seedDB.mySeed() == null) ? yacySeed.PEERTYPE_JUNIOR : yacyCore.seedDB.mySeed().get(yacySeed.PEERTYPE, yacySeed.PEERTYPE_VIRGIN); final boolean senior = (peertype.equals(yacySeed.PEERTYPE_SENIOR)) || (peertype.equals(yacySeed.PEERTYPE_PRINCIPAL)); diff --git a/htroot/yacy/hello.java b/htroot/yacy/hello.java index 89c614b4b..a25bb8f9c 100644 --- a/htroot/yacy/hello.java +++ b/htroot/yacy/hello.java @@ -103,7 +103,7 @@ public final class hello { // if ((properTest != null) && (! 
properTest.substring(0,1).equals("IP"))) { return null; } // we easily know the caller's IP: - final String clientip = (String) header.get("CLIENTIP", ""); // read an artificial header addendum + final String clientip = (String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, ""); // read an artificial header addendum InetAddress ias = serverDomains.dnsResolve(clientip); if (ias == null) { prop.put("message", "cannot resolve your IP from your reported location " + clientip); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 254f64668..ff8e2a33e 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -282,7 +282,7 @@ public final class search { // prepare search statistics Long trackerHandle = new Long(System.currentTimeMillis()); HashMap searchProfile = theQuery.resultProfile(joincount, System.currentTimeMillis() - timestamp, urlRetrievalAllTime, snippetComputationAllTime); - String client = (String) header.get("CLIENTIP"); + String client = (String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP); searchProfile.put("host", client); yacySeed remotepeer = yacyCore.seedDB.lookupByIP(natLib.getInetAddress(client), true, false, false); searchProfile.put("peername", (remotepeer == null) ? 
"unknown" : remotepeer.getName()); diff --git a/htroot/yacy/transfer.java b/htroot/yacy/transfer.java index 68970733c..c3e5a3fb1 100644 --- a/htroot/yacy/transfer.java +++ b/htroot/yacy/transfer.java @@ -89,14 +89,14 @@ public final class transfer { final yacySeed opeer = yacyCore.seedDB.get(ohash); if (opeer == null) { // reject unknown peers: this does not appear fair, but anonymous senders are dangerous - sb.getLog().logFine("RankingTransmission: rejected unknown peer '" + ohash + "', current IP " + header.get("CLIENTIP", "unknown")); + sb.getLog().logFine("RankingTransmission: rejected unknown peer '" + ohash + "', current IP " + header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "unknown")); return prop; } opeer.setLastSeenUTC(); if (filename.indexOf("..") >= 0) { // reject paths that contain '..' because they are dangerous - sb.getLog().logFine("RankingTransmission: rejected wrong path '" + filename + "' from peer " + opeer.getName() + "/" + opeer.getPublicAddress()+ ", current IP " + header.get("CLIENTIP", "unknown")); + sb.getLog().logFine("RankingTransmission: rejected wrong path '" + filename + "' from peer " + opeer.getName() + "/" + opeer.getPublicAddress()+ ", current IP " + header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "unknown")); return prop; } diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index fc2628284..322acd7ab 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -257,7 +257,7 @@ public class yacysearch { constraint, true); - String client = (String) header.get("CLIENTIP"); // the search client who initiated the search + String client = (String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP); // the search client who initiated the search // tell all threads to do nothing for a specific time sb.intermissionAllThreads(10000); diff --git a/source/de/anomic/data/userDB.java b/source/de/anomic/data/userDB.java index ec3989eee..19ed2c315 100644 --- a/source/de/anomic/data/userDB.java +++ 
b/source/de/anomic/data/userDB.java @@ -155,7 +155,7 @@ public final class userDB { return null; } public Entry getUser(httpHeader header){ - return getUser((String) header.get(httpHeader.AUTHORIZATION), (String)header.get("CLIENTIP"), header.getHeaderCookies()); + return getUser((String) header.get(httpHeader.AUTHORIZATION), (String)header.get(httpHeader.CONNECTION_PROP_CLIENTIP), header.getHeaderCookies()); } public Entry getUser(String auth, String ip, String cookies){ Entry entry=null; diff --git a/source/de/anomic/http/httpSSI.java b/source/de/anomic/http/httpSSI.java index 4b083e41b..f2fa36bc5 100644 --- a/source/de/anomic/http/httpSSI.java +++ b/source/de/anomic/http/httpSSI.java @@ -85,7 +85,7 @@ public class httpSSI { conProp.setProperty(httpHeader.CONNECTION_PROP_PATH, path); conProp.setProperty(httpHeader.CONNECTION_PROP_ARGS, args); conProp.setProperty(httpHeader.CONNECTION_PROP_HTTP_VER, httpHeader.HTTP_VERSION_0_9); - conProp.setProperty("CLIENTIP", "127.0.0.1"); + conProp.setProperty(httpHeader.CONNECTION_PROP_CLIENTIP, "127.0.0.1"); header.put(httpHeader.AUTHORIZATION, authorization); httpdFileHandler.doGet(conProp, header, out); } diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java index ebd028f88..e300ad718 100644 --- a/source/de/anomic/http/httpd.java +++ b/source/de/anomic/http/httpd.java @@ -193,7 +193,7 @@ public final class httpd implements serverHandler { public void initSession(serverCore.Session newsession) throws IOException { this.session = newsession; this.userAddress = session.userAddress; // client InetAddress - this.clientIP = this.userAddress.getHostAddress(); + this.clientIP = this.userAddress.getHostName(); if (this.userAddress.isAnyLocalAddress()) this.clientIP = "localhost"; if (this.clientIP.equals("0:0:0:0:0:0:0:1")) this.clientIP = "localhost"; if (this.clientIP.equals("127.0.0.1")) this.clientIP = "localhost"; @@ -1147,7 +1147,7 @@ public final class httpd implements serverHandler { // 
tp.put("host", serverCore.publicIP().getHostAddress()); // tp.put("port", switchboard.getConfig("port", "8080")); - String clientIP = conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP,"127.0.0.1"); + String clientIP = conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP, "127.0.0.1"); // check if ip is local ip address InetAddress hostAddress = serverDomains.dnsResolve(clientIP); diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java index a16abdaab..3d5bc15fe 100644 --- a/source/de/anomic/http/httpdFileHandler.java +++ b/source/de/anomic/http/httpdFileHandler.java @@ -303,13 +303,13 @@ public final class httpdFileHandler { if ((path.substring(0,(pos==-1)?path.length():pos)).endsWith("_p") && (adminAccountBase64MD5.length() != 0)) { //authentication required //userDB - if(sb.userDB.hasAdminRight(authorization, conProp.getProperty("CLIENTIP"), requestHeader.getHeaderCookies())){ + if(sb.userDB.hasAdminRight(authorization, conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP), requestHeader.getHeaderCookies())){ //Authentication successful. remove brute-force flag - serverCore.bfHost.remove(conProp.getProperty("CLIENTIP")); + serverCore.bfHost.remove(conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP)); //static }else if(authorization != null && httpd.staticAdminAuthenticated(authorization.trim().substring(6), switchboard)==4){ //Authentication successful. remove brute-force flag - serverCore.bfHost.remove(conProp.getProperty("CLIENTIP")); + serverCore.bfHost.remove(conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP)); //no auth }else if (authorization == null) { // no authorization given in response. Ask for that @@ -323,7 +323,7 @@ public final class httpdFileHandler { return; } else { // a wrong authentication was given or the userDB user does not have admin access. 
Ask again - String clientIP = conProp.getProperty("CLIENTIP", "unknown-host"); + String clientIP = conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP, "unknown-host"); serverLog.logInfo("HTTPD", "Wrong log-in for account 'admin' in http file handler for path '" + path + "' from host '" + clientIP + "'"); Integer attempts = (Integer) serverCore.bfHost.get(clientIP); if (attempts == null) @@ -473,7 +473,7 @@ public final class httpdFileHandler { // call an image-servlet to produce an on-the-fly - generated image Object img = null; try { - requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty("CLIENTIP")); + requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP)); requestHeader.put(httpHeader.CONNECTION_PROP_PATH, path); // in case that there are no args given, args = null or empty hashmap img = invokeServlet(targetClass, requestHeader, args); @@ -527,7 +527,7 @@ public final class httpdFileHandler { } } else if ((targetClass != null) && (path.endsWith(".stream"))) { // call rewrite-class - requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty("CLIENTIP")); + requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP)); requestHeader.put(httpHeader.CONNECTION_PROP_PATH, path); //requestHeader.put(httpHeader.CONNECTION_PROP_INPUTSTREAM, body); //requestHeader.put(httpHeader.CONNECTION_PROP_OUTPUTSTREAM, out); @@ -570,7 +570,7 @@ public final class httpdFileHandler { } else { // CGI-class: call the class to create a property for rewriting try { - requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty("CLIENTIP")); + requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP)); requestHeader.put(httpHeader.CONNECTION_PROP_PATH, path); // in case that there are no args given, args = null or empty hashmap Object tmp = invokeServlet(targetClass, 
requestHeader, args); @@ -586,7 +586,7 @@ public final class httpdFileHandler { if (tp.containsKey(servletProperties.ACTION_AUTHENTICATE)) { // handle brute-force protection if (authorization != null) { - String clientIP = conProp.getProperty("CLIENTIP", "unknown-host"); + String clientIP = conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP, "unknown-host"); serverLog.logInfo("HTTPD", "dynamic log-in for account 'admin' in http file handler for path '" + path + "' from host '" + clientIP + "'"); Integer attempts = (Integer) serverCore.bfHost.get(clientIP); if (attempts == null) diff --git a/source/de/anomic/index/indexRWIEntry.java b/source/de/anomic/index/indexRWIEntry.java index b21a4aa3f..15d48fc57 100644 --- a/source/de/anomic/index/indexRWIEntry.java +++ b/source/de/anomic/index/indexRWIEntry.java @@ -47,8 +47,6 @@ public interface indexRWIEntry { public String urlHash(); - public int quality(); - public int virtualAge(); public long lastModified(); diff --git a/source/de/anomic/index/indexRWIEntryOrder.java b/source/de/anomic/index/indexRWIEntryOrder.java index e8112659c..e25260d00 100644 --- a/source/de/anomic/index/indexRWIEntryOrder.java +++ b/source/de/anomic/index/indexRWIEntryOrder.java @@ -26,6 +26,7 @@ package de.anomic.index; +import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -55,12 +56,13 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder this.maxdomcount = 0; } - public void normalizeWith(indexContainer container) { + public ArrayList normalizeWith(indexContainer container) { // normalize ranking: find minimum and maxiumum of separate ranking criteria assert (container != null); - + ArrayList result = null; + //long s0 = System.currentTimeMillis(); - if ((processors > 1) && (container.size() > 10000)) { + if ((processors > 1) && (container.size() > 600)) { // run minmax with two threads int middle = container.size() / 2; minmaxfinder mmf0 = new minmaxfinder(container, 0, 
middle); @@ -83,6 +85,8 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder entry = di.next(); this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue()); } + result = mmf0.decodedEntries; + result.addAll(mmf1.decodedContainer()); //long s1= System.currentTimeMillis(), sc = Math.max(1, s1 - s0); //System.out.println("***DEBUG*** indexRWIEntry.Order (2-THREADED): " + sc + " milliseconds for " + container.size() + " entries, " + (container.size() / sc) + " entries/millisecond"); } else if (container.size() > 0) { @@ -97,10 +101,12 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder entry = di.next(); this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue()); } + result = mmf.decodedContainer(); //long s1= System.currentTimeMillis(), sc = Math.max(1, s1 - s0); //System.out.println("***DEBUG*** indexRWIEntry.Order (ONETHREAD): " + sc + " milliseconds for " + container.size() + " entries, " + (container.size() / sc) + " entries/millisecond"); } if (this.doms.size() > 0) this.maxdomcount = this.doms.getMaxScore(); + return result; } public kelondroOrder clone() { @@ -179,6 +185,7 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder private int start, end; private HashMap doms; private Integer int1; + ArrayList decodedEntries; public minmaxfinder(indexContainer container, int start /*including*/, int end /*excluding*/) { this.container = container; @@ -186,18 +193,20 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder this.end = end; this.doms = new HashMap(); this.int1 = new Integer(1); + this.decodedEntries = new ArrayList(); } public void run() { // find min/max to obtain limits for normalization this.entryMin = null; this.entryMax = null; - indexRWIRowEntry iEntry; + indexRWIVarEntry iEntry; int p = this.start; String dom; Integer count; while (p < this.end) { - iEntry = new indexRWIRowEntry(container.get(p++)); + iEntry = new indexRWIVarEntry(new 
indexRWIRowEntry(container.get(p++))); + this.decodedEntries.add(iEntry); // find min/max if (this.entryMin == null) this.entryMin = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.min(this.entryMin, iEntry); if (this.entryMax == null) this.entryMax = new indexRWIVarEntry(iEntry); else indexRWIVarEntry.max(this.entryMax, iEntry); @@ -212,6 +221,10 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder } } + public ArrayList decodedContainer() { + return this.decodedEntries; + } + public HashMap domcount() { return this.doms; } diff --git a/source/de/anomic/index/indexRWIRowEntry.java b/source/de/anomic/index/indexRWIRowEntry.java index d89a8ea7f..f4d6867a9 100644 --- a/source/de/anomic/index/indexRWIRowEntry.java +++ b/source/de/anomic/index/indexRWIRowEntry.java @@ -88,6 +88,8 @@ public final class indexRWIRowEntry implements indexRWIEntry { private static final int col_worddistance = 18; // i 1 initial zero; may be used as reserve: is filled during search private static final int col_reserve = 19; // k 1 reserve + public double termFrequency; + private kelondroRow.Entry entry; public indexRWIRowEntry(String urlHash, @@ -101,14 +103,14 @@ public final class indexRWIRowEntry implements indexRWIEntry { int posinphrase, // position of word in its phrase int posofphrase, // number of the phrase where word appears int worddistance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). 
If stored, this shows that the result was obtained by remote search - int sizeOfPage, // # of bytes of the page TODO: not needed any more long lastmodified, // last-modified time of the document where word appears long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short String language, // (guessed) language of document char doctype, // type of document int outlinksSame, // outlinks to same domain int outlinksOther, // outlinks to other domain - kelondroBitfield flags // attributes to the url and to the word according the url + kelondroBitfield flags, // attributes to the url and to the word according the url + double termFrequency ) { assert (urlHash.length() == 12) : "urlhash = " + urlHash; @@ -136,6 +138,7 @@ public final class indexRWIRowEntry implements indexRWIEntry { this.entry.setCol(col_posofphrase, posofphrase); this.entry.setCol(col_worddistance, worddistance); this.entry.setCol(col_reserve, 0); + this.termFrequency = termFrequency; } public indexRWIRowEntry(String urlHash, String code) { @@ -183,10 +186,6 @@ public final class indexRWIRowEntry implements indexRWIEntry { return this.entry.getColString(col_urlhash, null); } - public int quality() { - return 0; // not used any more - } - public int virtualAge() { return (int) this.entry.getColLong(col_lastModified); // this is the time in MicoDateDays format } @@ -256,7 +255,8 @@ public final class indexRWIRowEntry implements indexRWIEntry { } public double termFrequency() { - return (((double) this.hitcount()) / ((double) (this.wordsintext() + this.wordsintitle() + 1))); + if (this.termFrequency == 0.0) this.termFrequency = (((double) this.hitcount()) / ((double) (this.wordsintext() + this.wordsintitle() + 1))); + return this.termFrequency; } public String toString() { @@ -288,18 +288,12 @@ public final class indexRWIRowEntry implements indexRWIEntry { public boolean isNewer(indexRWIEntry other) { if (other == null) return true; if 
(this.lastModified() > other.lastModified()) return true; - if (this.lastModified() == other.lastModified()) { - if (this.quality() > other.quality()) return true; - } return false; } public boolean isOlder(indexRWIEntry other) { if (other == null) return false; if (this.lastModified() < other.lastModified()) return true; - if (this.lastModified() == other.lastModified()) { - if (this.quality() < other.quality()) return true; - } return false; } diff --git a/source/de/anomic/index/indexRWIVarEntry.java b/source/de/anomic/index/indexRWIVarEntry.java index 854eeab0d..68b430efa 100644 --- a/source/de/anomic/index/indexRWIVarEntry.java +++ b/source/de/anomic/index/indexRWIVarEntry.java @@ -37,7 +37,7 @@ public class indexRWIVarEntry implements indexRWIEntry { public char type; public int hitcount, llocal, lother, phrasesintext, posintext, posinphrase, posofphrase, - quality, urlcomps, urllength, virtualAge, + urlcomps, urllength, virtualAge, worddistance, wordsintext, wordsintitle; public double termFrequency; @@ -55,7 +55,6 @@ public class indexRWIVarEntry implements indexRWIEntry { this.posintext = e.posintext(); this.posinphrase = e.posinphrase(); this.posofphrase = e.posofphrase(); - this.quality = e.quality(); this.urlcomps = e.urlcomps(); this.urllength = e.urllength(); this.virtualAge = e.virtualAge(); @@ -133,9 +132,29 @@ public class indexRWIVarEntry implements indexRWIEntry { public int posofphrase() { return posofphrase; } - - public int quality() { - return quality; + + private indexRWIRowEntry toRowEntry() { + return new indexRWIRowEntry( + urlHash, + urllength, // byte-length of complete URL + urlcomps, // number of path components + wordsintitle, // length of description/length (longer are better?) 
+ hitcount, // how often appears this word in the text + wordsintext, // total number of words + phrasesintext, // total number of phrases + posintext, // position of word in all words + posinphrase, // position of word in its phrase + posofphrase, // number of the phrase where word appears + worddistance, // word distance + lastModified, // last-modified time of the document where word appears + System.currentTimeMillis(), // update time; + language, // (guessed) language of document + type, // type of document + llocal, // outlinks to same domain + lother, // outlinks to other domain + flags, // attributes to the url and to the word according the url + termFrequency + ); } public Entry toKelondroEntry() { @@ -144,8 +163,7 @@ public class indexRWIVarEntry implements indexRWIEntry { } public String toPropertyForm() { - assert false; // should not be used - return null; + return toRowEntry().toPropertyForm(); } public String urlHash() { @@ -177,7 +195,8 @@ public class indexRWIVarEntry implements indexRWIEntry { } public double termFrequency() { - return termFrequency; + if (this.termFrequency == 0.0) this.termFrequency = (((double) this.hitcount()) / ((double) (this.wordsintext() + this.wordsintitle() + 1))); + return this.termFrequency; } public static final void min(indexRWIVarEntry t, indexRWIEntry other) { @@ -187,7 +206,6 @@ public class indexRWIVarEntry implements indexRWIEntry { if (t.hitcount() > (v = other.hitcount())) t.hitcount = v; if (t.llocal() > (v = other.llocal())) t.llocal = v; if (t.lother() > (v = other.lother())) t.lother = v; - if (t.quality() > (v = other.quality())) t.quality = v; if (t.virtualAge() > (v = other.virtualAge())) t.virtualAge = v; if (t.wordsintext() > (v = other.wordsintext())) t.wordsintext = v; if (t.phrasesintext() > (v = other.phrasesintext())) t.phrasesintext = v; @@ -210,7 +228,6 @@ public class indexRWIVarEntry implements indexRWIEntry { if (t.hitcount() < (v = other.hitcount())) t.hitcount = v; if (t.llocal() < (v = 
other.llocal())) t.llocal = v; if (t.lother() < (v = other.lother())) t.lother = v; - if (t.quality() < (v = other.quality())) t.quality = v; if (t.virtualAge() < (v = other.virtualAge())) t.virtualAge = v; if (t.wordsintext() < (v = other.wordsintext())) t.wordsintext = v; if (t.phrasesintext() < (v = other.phrasesintext())) t.phrasesintext = v; diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java index a3877f2bb..3c40eb13b 100644 --- a/source/de/anomic/index/indexURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -115,7 +115,7 @@ public class indexURLEntry { private kelondroRow.Entry entry; private String snippet; - private indexRWIRowEntry word; // this is only used if the url is transported via remote search requests + private indexRWIEntry word; // this is only used if the url is transported via remote search requests private long ranking; // during generation of a search result this value is set public indexURLEntry( @@ -185,7 +185,7 @@ public class indexURLEntry { return s.toString().getBytes(); } - public indexURLEntry(kelondroRow.Entry entry, indexRWIRowEntry searchedWord, long ranking) { + public indexURLEntry(kelondroRow.Entry entry, indexRWIEntry searchedWord, long ranking) { this.entry = entry; this.snippet = null; this.word = searchedWord; @@ -391,7 +391,7 @@ public class indexURLEntry { return snippet; } - public indexRWIRowEntry word() { + public indexRWIEntry word() { return word; } diff --git a/source/de/anomic/kelondro/kelondroSplitTable.java b/source/de/anomic/kelondro/kelondroSplitTable.java index 0297b432f..36be1c277 100644 --- a/source/de/anomic/kelondro/kelondroSplitTable.java +++ b/source/de/anomic/kelondro/kelondroSplitTable.java @@ -119,7 +119,7 @@ public class kelondroSplitTable implements kelondroIndex { // this is a kelonodroFlex table table = new kelondroCache(new kelondroFlexTable(path, maxf, preloadTime, rowdef, 0, resetOnFail)); } else { - table = new kelondroEcoTable(f, 
rowdef, kelondroEcoTable.tailCacheDenyUsage, EcoFSBufferSize, 0); + table = new kelondroEcoTable(f, rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0); } tables.put(date, table); } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 45b07940d..134456d6d 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -66,7 +66,7 @@ import java.util.LinkedList; import de.anomic.data.htmlTools; import de.anomic.http.httpc; import de.anomic.http.httpc.response; -import de.anomic.index.indexRWIRowEntry; +import de.anomic.index.indexRWIEntry; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroCache; @@ -153,7 +153,7 @@ public final class plasmaCrawlLURL { return 0; } - public synchronized indexURLEntry load(String urlHash, indexRWIRowEntry searchedWord, long ranking) { + public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord, long ranking) { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. 
diff --git a/source/de/anomic/plasma/plasmaCrawlZURL.java b/source/de/anomic/plasma/plasmaCrawlZURL.java index c6b0d0b83..38901ede0 100644 --- a/source/de/anomic/plasma/plasmaCrawlZURL.java +++ b/source/de/anomic/plasma/plasmaCrawlZURL.java @@ -69,7 +69,7 @@ public class plasmaCrawlZURL { if (f.isDirectory()) kelondroFlexTable.delete(cachePath, tablename); else f.delete(); } } - urlIndex = new kelondroEcoTable(f, rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0); + urlIndex = new kelondroEcoTable(f, rowdef, kelondroEcoTable.tailCacheDenyUsage, EcoFSBufferSize, 0); //urlIndex = new kelondroFlexTable(cachePath, tablename, -1, rowdef, 0, true); } diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index 9ec46d613..a5e8139e7 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -28,6 +28,7 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -40,6 +41,7 @@ import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; import de.anomic.index.indexRWIEntryOrder; import de.anomic.index.indexRWIRowEntry; +import de.anomic.index.indexRWIVarEntry; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBinSearch; import de.anomic.kelondro.kelondroMScoreCluster; @@ -52,8 +54,8 @@ public final class plasmaSearchRankingProcess { public static kelondroBinSearch[] ybrTables = null; // block-rank tables private static boolean useYBR = true; - private TreeMap sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if sortorder < 2 then key is instance of String - private HashMap> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries + private TreeMap sortedRWIEntries; // key = ranking (Long); value = indexRWIEntry; if 
sortorder < 2 then key is instance of String + private HashMap> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries private HashMap handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process private plasmaSearchQuery query; private int sortorder; @@ -72,8 +74,8 @@ public final class plasmaSearchRankingProcess { // attention: if minEntries is too high, this method will not terminate within the maxTime // sortorder: 0 = hash, 1 = url, 2 = ranking this.localSearchContainerMaps = null; - this.sortedRWIEntries = new TreeMap(); - this.doubleDomCache = new HashMap>(); + this.sortedRWIEntries = new TreeMap(); + this.doubleDomCache = new HashMap>(); this.handover = new HashMap(); this.order = null; this.query = query; @@ -132,11 +134,11 @@ public final class plasmaSearchRankingProcess { this.remote_indexCount += index.size(); } - indexRWIRowEntry ientry; + indexRWIVarEntry ientry; indexURLEntry uentry; String u; loop: while (en.hasNext()) { - ientry = en.next(); + ientry = new indexRWIVarEntry(en.next()); // check constraints if (!testFlags(ientry)) continue loop; @@ -183,13 +185,13 @@ public final class plasmaSearchRankingProcess { if (this.order == null) { this.order = new indexRWIEntryOrder(query.ranking); } - this.order.normalizeWith(index); + ArrayList decodedEntries = this.order.normalizeWith(index); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer)); // normalize entries and get ranking timer = System.currentTimeMillis(); - Iterator i = index.entries(); - indexRWIRowEntry iEntry, l; + Iterator i = decodedEntries.iterator(); + indexRWIVarEntry iEntry, l; long biggestEntry = 0; //long s0 = System.currentTimeMillis(); Long r; @@ -272,8 +274,8 @@ public final class plasmaSearchRankingProcess { private synchronized Object[] /*{Object, indexRWIEntry}*/ bestRWI(boolean 
skipDoubleDom) { // returns from the current RWI list the best entry and removed this entry from the list Object bestEntry; - TreeMap m; - indexRWIRowEntry rwi; + TreeMap m; + indexRWIVarEntry rwi; while (sortedRWIEntries.size() > 0) { bestEntry = sortedRWIEntries.firstKey(); rwi = sortedRWIEntries.remove(bestEntry); @@ -283,7 +285,7 @@ public final class plasmaSearchRankingProcess { m = this.doubleDomCache.get(domhash); if (m == null) { // first appearance of dom - m = new TreeMap(); + m = new TreeMap(); this.doubleDomCache.put(domhash, m); return new Object[]{bestEntry, rwi}; } @@ -292,10 +294,10 @@ public final class plasmaSearchRankingProcess { } // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache // find best entry from all caches - Iterator> i = this.doubleDomCache.values().iterator(); + Iterator> i = this.doubleDomCache.values().iterator(); bestEntry = null; Object o; - indexRWIRowEntry bestrwi = null; + indexRWIVarEntry bestrwi = null; while (i.hasNext()) { m = i.next(); if (m.size() == 0) continue; @@ -331,7 +333,7 @@ public final class plasmaSearchRankingProcess { while ((sortedRWIEntries.size() > 0) || (size() > 0)) { Object[] obrwi = bestRWI(skipDoubleDom); Object bestEntry = obrwi[0]; - indexRWIRowEntry ientry = (indexRWIRowEntry) obrwi[1]; + indexRWIVarEntry ientry = (indexRWIVarEntry) obrwi[1]; long ranking = (bestEntry instanceof Long) ? 
((Long) bestEntry).longValue() : 0; indexURLEntry u = wordIndex.loadedURL.load(ientry.urlHash(), ientry, ranking); if (u != null) { @@ -347,7 +349,7 @@ public final class plasmaSearchRankingProcess { public synchronized int size() { //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size(); int c = sortedRWIEntries.size(); - Iterator> i = this.doubleDomCache.values().iterator(); + Iterator> i = this.doubleDomCache.values().iterator(); while (i.hasNext()) c += i.next().size(); return c; } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 846e748c4..8e07c28b5 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -414,7 +414,7 @@ public class plasmaSnippetCache { resInfo = entry.getDocumentInfo(); // read resource body (if it is there) - byte []resourceArray = entry.cacheArray(); + byte[] resourceArray = entry.cacheArray(); if (resourceArray != null) { resContent = new ByteArrayInputStream(resourceArray); resContentLength = resourceArray.length; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 2b99668e1..97b626c25 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -906,7 +906,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } catch (MalformedURLException e) { } } else { - File networkUnitDefinitionFile = new File(rootPath, networkUnitDefinition); + File networkUnitDefinitionFile = (networkUnitDefinition.startsWith("/")) ? 
new File(networkUnitDefinition) : new File(rootPath, networkUnitDefinition); if (networkUnitDefinitionFile.exists()) { initProps = serverFileUtils.loadHashMap(networkUnitDefinitionFile); this.setConfig(initProps); @@ -2348,14 +2348,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser wordStat.posInPhrase, wordStat.numOfPhrase, 0, - newEntry.size(), docDate.getTime(), System.currentTimeMillis(), language, doctype, ioLinks[0].intValue(), ioLinks[1].intValue(), - condenser.RESULT_FLAGS + condenser.RESULT_FLAGS, + 0.0 ); indexContainer wordIdxContainer = plasmaWordIndex.emptyContainer(wordHash, 1); wordIdxContainer.add(wordIdxEntry); @@ -2573,10 +2573,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (authorization.length() > 256) return 0; // authorization by encoded password, only for localhost access - if ((((String) header.get("CLIENTIP", "")).equals("localhost")) && (adminAccountBase64MD5.equals(authorization))) return 3; // soft-authenticated for localhost + if ((((String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")).equals("localhost")) && (adminAccountBase64MD5.equals(authorization))) return 3; // soft-authenticated for localhost // authorization by hit in userDB - if (userDB.hasAdminRight((String) header.get(httpHeader.AUTHORIZATION, "xxxxxx"), ((String) header.get("CLIENTIP", "")), header.getHeaderCookies())) return 4; //return, because 4=max + if (userDB.hasAdminRight((String) header.get(httpHeader.AUTHORIZATION, "xxxxxx"), ((String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")), header.getHeaderCookies())) return 4; //return, because 4=max // authorization with admin keyword in configuration return httpd.staticAdminAuthenticated(authorization, this); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 4020bd885..3f5fcf62f 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ 
b/source/de/anomic/plasma/plasmaWordIndex.java @@ -314,13 +314,13 @@ public final class plasmaWordIndex implements indexRI { wprop.posInPhrase, wprop.numOfPhrase, 0, - size, urlModified.getTime(), System.currentTimeMillis(), language, doctype, outlinksSame, outlinksOther, - wprop.flags); + wprop.flags, + 0.0); addEntry(plasmaCondenser.word2hash(word), ientry, System.currentTimeMillis(), false); wordCount++; }