diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 8c99a2f68..3a6041a77 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -117,13 +117,6 @@ public class yacysearch { } // collect search attributes - int maxDistance = Integer.MAX_VALUE; - - if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) { - querystring = querystring.substring(1, querystring.length() - 1).trim(); - maxDistance = 1; - } - int itemsPerPage = Math.min((authenticated) ? 1000 : 10, post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative int offset = (post.hasValue("query") && post.hasValue("former") && !post.get("query","").equalsIgnoreCase(post.get("former",""))) ? 0 : post.getInt("startRecord", post.getInt("offset", 0)); @@ -181,6 +174,7 @@ public class yacysearch { } if ((!block) && (post == null || post.get("cat", "href").equals("href"))) { + // check available memory and clean up if necessary if (!serverMemory.request(8000000L, false)) { sb.webIndex.clearCache(); @@ -197,7 +191,9 @@ public class yacysearch { query[0].remove("recent"); ranking.coeff_date = plasmaSearchRankingProfile.COEFF_MAX; } - + + int maxDistance = (querystring.indexOf('"') >= 0) ? maxDistance = query.length - 1 : Integer.MAX_VALUE; + // filter out stopwords final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords); if (filtered.size() > 0) { diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 52c7b0f67..9d66fe260 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -493,7 +493,7 @@ public class Balancer { 15000, Math.max( (crawlEntry.url().isLocal()) ? minimumLocalDelta : minimumGlobalDelta, - plasmaSwitchboard.getSwitchboard().robots.crawlDelay(crawlEntry.url()) * 1000) + plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(crawlEntry.url())) ); // prevent that that robots file can stop our indexer completely if (delta < genericDelta) { // force a busy waiting here diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 7f9fad4b8..78657f40f 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -208,7 +208,7 @@ public class RobotsTxt { (Date) result[DOWNLOAD_MODDATE], (String) result[DOWNLOAD_ETAG], parserResult.sitemap(), - parserResult.crawlDelay()); + parserResult.crawlDelayMillis()); } } } @@ -216,10 +216,10 @@ public class RobotsTxt { return robotsTxt4Host; } - public int crawlDelay(final yacyURL theURL) { + public long crawlDelayMillis(final yacyURL theURL) { final String urlHostPort = getHostPort(theURL); final RobotsTxt.Entry robotsEntry = getEntry(urlHostPort, true); - return robotsEntry.getCrawlDelay(); + return robotsEntry.getCrawlDelayMillis(); } private Entry addEntry( @@ -230,11 +230,11 @@ public class RobotsTxt { final Date modDate, final String eTag, final String sitemap, - final int crawlDelay + final long crawlDelayMillis ) { final Entry entry = new Entry( hostName, allowPathList, denyPathList, loadedDate, modDate, - eTag, sitemap, crawlDelay); + eTag, sitemap, crawlDelayMillis); addEntry(entry); return entry; } @@ -257,6 +257,7 @@ public class RobotsTxt { public static final String ETAG = "etag"; public static final String SITEMAP = "sitemap"; public static final String CRAWL_DELAY = "crawlDelay"; + public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis"; // this is a simple record structure that holds all properties of a single crawl start HashMap mem; @@ -301,7 +302,7 @@ public class RobotsTxt { final Date modDate, final String eTag, final String sitemap, - final int crawlDelay + final long crawlDelayMillis ) { if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing"); @@ -314,7 +315,7 @@ public class RobotsTxt { if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime())); if (eTag != null) this.mem.put(ETAG,eTag); if (sitemap != null) this.mem.put(SITEMAP,sitemap); - if (crawlDelay != 0) this.mem.put(CRAWL_DELAY, Integer.toString(crawlDelay)); + if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis)); if ((allowPathList != null)&&(allowPathList.size()>0)) { this.allowPathList.addAll(allowPathList); @@ -382,9 +383,14 @@ public class RobotsTxt { return null; } - public int getCrawlDelay() { + public long getCrawlDelayMillis() { + if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try { + return Long.parseLong(this.mem.get(CRAWL_DELAY_MILLIS)); + } catch (final NumberFormatException e) { + return 0; + } if (this.mem.containsKey(CRAWL_DELAY)) try { - return Integer.parseInt(this.mem.get(CRAWL_DELAY)); + return 1000 * Integer.parseInt(this.mem.get(CRAWL_DELAY)); } catch (final NumberFormatException e) { return 0; } @@ -458,19 +464,19 @@ public class RobotsTxt { return sitemapURL; } - public Integer getCrawlDelay(final yacyURL theURL) { + public Long getCrawlDelayMillis(final yacyURL theURL) { if (theURL == null) throw new IllegalArgumentException(); - Integer crawlDelay = null; + Long crawlDelay = null; // generating the hostname:poart string needed to do a DB lookup final String urlHostPort = getHostPort(theURL); final RobotsTxt.Entry robotsTxt4Host = getEntry(urlHostPort, true); try { - crawlDelay = robotsTxt4Host.getCrawlDelay(); + crawlDelay = robotsTxt4Host.getCrawlDelayMillis(); } catch (final NumberFormatException e) {/* ignore this */} - return crawlDelay; + return crawlDelay; } public boolean isDisallowed(final yacyURL nexturl) { diff --git a/source/de/anomic/crawler/robotsParser.java b/source/de/anomic/crawler/robotsParser.java index 29fa2da6d..845af07f1 100644 --- a/source/de/anomic/crawler/robotsParser.java +++ b/source/de/anomic/crawler/robotsParser.java @@ -65,14 +65,14 @@ public final class robotsParser { private ArrayList allowList; private ArrayList denyList; private String sitemap; - private int crawlDelay; + private long crawlDelayMillis; public robotsParser(final byte[] robotsTxt) { if ((robotsTxt == null)||(robotsTxt.length == 0)) { allowList = new ArrayList(0); denyList = new ArrayList(0); sitemap = ""; - crawlDelay = 0; + crawlDelayMillis = 0; } else { final ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt); final BufferedReader reader = new BufferedReader(new InputStreamReader(bin)); @@ -85,7 +85,7 @@ public final class robotsParser { allowList = new ArrayList(0); denyList = new ArrayList(0); sitemap = ""; - crawlDelay = 0; + crawlDelayMillis = 0; } else { parse(reader); } @@ -100,7 +100,7 @@ public final class robotsParser { int pos; String line = null, lineUpper = null; sitemap = null; - crawlDelay = 0; + crawlDelayMillis = 0; boolean isRule4AllAgents = false, isRule4YaCyAgent = false, rule4YaCyFound = false, @@ -130,7 +130,7 @@ public final class robotsParser { inBlock = false; isRule4AllAgents = false; isRule4YaCyAgent = false; - crawlDelay = 0; // each block has a separate delay + crawlDelayMillis = 0; // each block has a separate delay } // cutting off comments at the line end @@ -138,7 +138,7 @@ public final class robotsParser { if (pos != -1) line = line.substring(0,pos).trim(); // replacing all tabs with spaces - line = line.replaceAll("\t"," "); + line = line.replaceAll("\t"," ").replaceAll(":"," "); // getting out the robots name pos = line.indexOf(" "); @@ -149,10 +149,14 @@ public final class robotsParser { if (isRule4YaCyAgent) rule4YaCyFound = true; } } else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) { + // replacing all tabs with spaces + line = line.replaceAll("\t"," ").replaceAll(":"," "); + pos = line.indexOf(" "); if (pos != -1) { try { - crawlDelay = Integer.parseInt(line.substring(pos).trim()); + // the crawl delay can be a float number and means number of seconds + crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim())); } catch (final NumberFormatException e) { // invalid crawling delay } @@ -171,7 +175,7 @@ public final class robotsParser { if (line.endsWith("*")) line = line.substring(0,line.length()-1); // replacing all tabs with spaces - line = line.replaceAll("\t"," "); + line = line.replaceAll("\t"," ").replaceAll(":"," "); // getting the path pos = line.indexOf(" "); @@ -210,8 +214,8 @@ public final class robotsParser { denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents; } - public int crawlDelay() { - return this.crawlDelay; + public long crawlDelayMillis() { + return this.crawlDelayMillis; } public String sitemap() { diff --git a/source/de/anomic/http/JakartaCommonsHttpClient.java b/source/de/anomic/http/JakartaCommonsHttpClient.java index 1e23e8263..bfeb96b5e 100644 --- a/source/de/anomic/http/JakartaCommonsHttpClient.java +++ b/source/de/anomic/http/JakartaCommonsHttpClient.java @@ -144,7 +144,6 @@ public class JakartaCommonsHttpClient { private boolean followRedirects = true; private boolean ignoreCookies = false; - /** * creates a new JakartaCommonsHttpClient with given timeout using global remoteProxyConfig * @@ -434,8 +433,8 @@ public class JakartaCommonsHttpClient { HttpConnectionInfo.addConnection(generateConInfo(method)); // execute (send request) - serverLog.logFine("HTTPC", "executing " + method.hashCode() + " " + method.getName() + " " + method.getURI()); - serverLog.logFinest("HTTPC", "->" + method.hashCode() + " request headers " + + if (serverLog.isFine("HTTPC")) serverLog.logFine("HTTPC", "executing " + method.hashCode() + " " + method.getName() + " " + method.getURI()); + if (serverLog.isFinest("HTTPC")) serverLog.logFinest("HTTPC", "->" + method.hashCode() + " request headers " + Arrays.toString(method.getRequestHeaders())); try { if (hostConfig == null) { @@ -448,7 +447,7 @@ public class JakartaCommonsHttpClient { HttpConnectionInfo.removeConnection(generateConInfo(method)); throw e; } - serverLog.logFinest("HTTPC", "<-" + method.hashCode() + " response headers " + + if (serverLog.isFinest("HTTPC")) serverLog.logFinest("HTTPC", "<-" + method.hashCode() + " response headers " + Arrays.toString(method.getResponseHeaders())); // return response diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java index a5fa3d501..19ee3a692 100644 --- a/source/de/anomic/http/httpd.java +++ b/source/de/anomic/http/httpd.java @@ -902,7 +902,7 @@ public final class httpd implements serverHandler, Cloneable { try { items = upload.parseRequest(request); } catch (FileUploadException e) { - e.printStackTrace(); + //e.printStackTrace(); throw new IOException("FileUploadException " + e.getMessage()); } diff --git a/source/de/anomic/index/indexContainer.java b/source/de/anomic/index/indexContainer.java index 71e8b200a..e642e8086 100644 --- a/source/de/anomic/index/indexContainer.java +++ b/source/de/anomic/index/indexContainer.java @@ -314,7 +314,7 @@ public class indexContainer extends kelondroRowSet { } private static indexContainer joinConstructiveByTest(final indexContainer small, final indexContainer large, final int maxDistance) { - System.out.println("DEBUG: JOIN METHOD BY TEST"); + System.out.println("DEBUG: JOIN METHOD BY TEST, maxdistance = " + maxDistance); assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString(); final int keylength = small.rowdef.width(0); assert (keylength == large.rowdef.width(0)); @@ -337,7 +337,7 @@ public class indexContainer extends kelondroRowSet { } private static indexContainer joinConstructiveByEnumeration(final indexContainer i1, final indexContainer i2, final int maxDistance) { - System.out.println("DEBUG: JOIN METHOD BY ENUMERATION"); + System.out.println("DEBUG: JOIN METHOD BY ENUMERATION, maxdistance = " + maxDistance); assert i1.rowdef.equals(i2.rowdef) : "i1 = " + i1.rowdef.toString() + "; i2 = " + i2.rowdef.toString(); final int keylength = i1.rowdef.width(0); assert (keylength == i2.rowdef.width(0)); diff --git a/source/de/anomic/index/indexRWIVarEntry.java b/source/de/anomic/index/indexRWIVarEntry.java index 05355d4fe..d7e03b132 100644 --- a/source/de/anomic/index/indexRWIVarEntry.java +++ b/source/de/anomic/index/indexRWIVarEntry.java @@ -310,7 +310,7 @@ public class indexRWIVarEntry implements indexRWIEntry, Cloneable { // joins two entries into one entry // combine the distance - this.worddistance = this.worddistance + ((oe instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) oe).worddistance : 0) + Math.abs(this.posintext() - oe.posintext()); + this.worddistance = Math.abs(this.posintext() - oe.posintext()); this.posintext = Math.min(this.posintext, oe.posintext()); this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0; this.posofphrase = Math.min(this.posofphrase, oe.posofphrase()); diff --git a/source/de/anomic/kelondro/kelondroBLOBHeap.java b/source/de/anomic/kelondro/kelondroBLOBHeap.java index 6f3c4aef9..bb681d245 100755 --- a/source/de/anomic/kelondro/kelondroBLOBHeap.java +++ b/source/de/anomic/kelondro/kelondroBLOBHeap.java @@ -145,7 +145,7 @@ public final class kelondroBLOBHeap implements kelondroBLOB { lastFree = i.next(); while (i.hasNext()) { nextFree = i.next(); - System.out.println("*** DEBUG BLOB: free-seek = " + nextFree.seek + ", size = " + nextFree.size); + //System.out.println("*** DEBUG BLOB: free-seek = " + nextFree.seek + ", size = " + nextFree.size); // check if they follow directly if (lastFree.seek + lastFree.size + 4 == nextFree.seek) { // merge those records diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 4ab75290c..6fbd199c5 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -227,17 +227,15 @@ public final class plasmaSearchQuery { return kelondroMSetTools.anymatch(wordhashes, keyhashes); } + private static String seps = "'.,:/&"; static {seps += '"';} + @SuppressWarnings("unchecked") public static TreeSet[] cleanQuery(String querystring) { // returns two sets: a query set and a exclude set if ((querystring == null) || (querystring.length() == 0)) return new TreeSet[]{new TreeSet(kelondroNaturalOrder.naturalComparator), new TreeSet(kelondroNaturalOrder.naturalComparator)}; // convert Umlaute - querystring = htmlFilterAbstractScraper.convertUmlaute(new serverCharBuffer(querystring.toCharArray())).toString(); - - // remove funny symbols - final String seps = "'.,:/&"; - querystring = querystring.toLowerCase().trim(); + querystring = htmlFilterAbstractScraper.convertUmlaute(new serverCharBuffer(querystring.toCharArray())).toString().toLowerCase().trim(); int c; for (int i = 0; i < seps.length(); i++) { while ((c = querystring.indexOf(seps.charAt(i))) >= 0) { querystring = querystring.substring(0, c) + (((c + 1) < querystring.length()) ? (" " + querystring.substring(c + 1)) : ""); } @@ -288,7 +286,8 @@ public final class plasmaSearchQuery { "*" + indexWord.word2hash(this.ranking.toExternalString()) + "*" + this.prefer + "*" + this.urlMask + - "*" + this.constraint; + "*" + this.constraint + + "*" + this.maxDistance; if (anonymized) return anonymizedQueryHashes(this.queryHashes) + "-" + anonymizedQueryHashes(this.excludeHashes) + context; else diff --git a/source/de/anomic/server/logging/serverLog.java b/source/de/anomic/server/logging/serverLog.java index 6017fade6..e311d7f61 100644 --- a/source/de/anomic/server/logging/serverLog.java +++ b/source/de/anomic/server/logging/serverLog.java @@ -103,8 +103,8 @@ public final class serverLog { public static void logSevere(final String appName, final String message, final Throwable thrown) { Logger.getLogger(appName).log(Level.SEVERE,message,thrown); } - public static void isSevere(final String appName) { - Logger.getLogger(appName).isLoggable(Level.SEVERE); + public static boolean isSevere(final String appName) { + return Logger.getLogger(appName).isLoggable(Level.SEVERE); } public static void logWarning(final String appName, final String message) { @@ -113,8 +113,8 @@ public final class serverLog { public static void logWarning(final String appName, final String message, final Throwable thrown) { Logger.getLogger(appName).log(Level.WARNING,message,thrown); } - public static void isWarning(final String appName) { - Logger.getLogger(appName).isLoggable(Level.WARNING); + public static boolean isWarning(final String appName) { + return Logger.getLogger(appName).isLoggable(Level.WARNING); } public static void logConfig(final String appName, final String message) { @@ -123,8 +123,8 @@ public final class serverLog { public static void logConfig(final String appName, final String message, final Throwable thrown) { Logger.getLogger(appName).log(Level.CONFIG,message,thrown); } - public static void isConfig(final String appName) { - Logger.getLogger(appName).isLoggable(Level.CONFIG); + public static boolean isConfig(final String appName) { + return Logger.getLogger(appName).isLoggable(Level.CONFIG); } public static void logInfo(final String appName, final String message) { @@ -133,8 +133,8 @@ public final class serverLog { public static void logInfo(final String appName, final String message, final Throwable thrown) { Logger.getLogger(appName).log(Level.INFO,message,thrown); } - public static void isInfo(final String appName) { - Logger.getLogger(appName).isLoggable(Level.INFO); + public static boolean isInfo(final String appName) { + return Logger.getLogger(appName).isLoggable(Level.INFO); } public static void logFine(final String appName, final String message) { @@ -143,8 +143,8 @@ public final class serverLog { public static void logFine(final String appName, final String message, final Throwable thrown) { Logger.getLogger(appName).log(Level.FINE,message,thrown); } - public static void isFine(final String appName) { - Logger.getLogger(appName).isLoggable(Level.FINE); + public static boolean isFine(final String appName) { + return Logger.getLogger(appName).isLoggable(Level.FINE); } public static void logFiner(final String appName, final String message) { @@ -153,8 +153,8 @@ public final class serverLog { public static void logFiner(final String appName, final String message, final Throwable thrown) { Logger.getLogger(appName).log(Level.FINER,message,thrown); } - public static void isFiner(final String appName) { - Logger.getLogger(appName).isLoggable(Level.FINER); + public static boolean isFiner(final String appName) { + return Logger.getLogger(appName).isLoggable(Level.FINER); } public static void logFinest(final String appName, final String message) { @@ -163,8 +163,8 @@ public final class serverLog { public static void logFinest(final String appName, final String message, final Throwable thrown) { Logger.getLogger(appName).log(Level.FINEST,message,thrown); } - public static void isFinest(final String appName) { - Logger.getLogger(appName).isLoggable(Level.FINEST); + public static boolean isFinest(final String appName) { + return Logger.getLogger(appName).isLoggable(Level.FINEST); } public static final void configureLogging(final File homePath, final File loggingConfigFile) throws SecurityException, FileNotFoundException, IOException { diff --git a/source/de/anomic/server/serverProfiling.java b/source/de/anomic/server/serverProfiling.java index a932225f4..f35c0586f 100644 --- a/source/de/anomic/server/serverProfiling.java +++ b/source/de/anomic/server/serverProfiling.java @@ -33,14 +33,8 @@ import java.util.concurrent.ConcurrentLinkedQueue; public class serverProfiling extends Thread { - /** - * key=name of history, value=TreeMap of Long/Event - */ - private static final Map> historyMaps = new ConcurrentHashMap>();; - /** - * key=name of history, value=Integer of event counter - */ - private static final Map eventCounter = new ConcurrentHashMap(); + private static final Map> historyMaps = new ConcurrentHashMap>(); // value=TreeMap of Long/Event + private static final Map eventAccess = new ConcurrentHashMap(); // value: last time when this was accessed private static serverProfiling systemProfiler = null; public static void startSystemProfiling() { @@ -73,14 +67,21 @@ public class serverProfiling extends Thread { public static void update(final String eventName, final Object eventPayload) { // get event history container - int counter = eventCounter.containsKey(eventName) ? (eventCounter.get(eventName)).intValue() : 0; - if (historyMaps.containsKey(eventName)) { - final ConcurrentLinkedQueue history = historyMaps.get(eventName); + Long lastAcc = eventAccess.get(eventName); + if (lastAcc == null) { + eventAccess.put(eventName, new Long(System.currentTimeMillis())); + } else { + if (System.currentTimeMillis() - lastAcc.longValue() > 1000) { + eventAccess.put(eventName, new Long(System.currentTimeMillis())); + } else { + return; // protect against too heavy load + } + } + ConcurrentLinkedQueue history = historyMaps.get(eventName); + if (history != null) { // update entry - history.add(new Event(counter, eventPayload)); - counter++; - eventCounter.put(eventName, Integer.valueOf(counter)); + history.add(new Event(eventPayload)); // clean up too old entries Event e; @@ -91,12 +92,10 @@ public class serverProfiling extends Thread { history.poll(); } } else { - final ConcurrentLinkedQueue history = new ConcurrentLinkedQueue(); + history = new ConcurrentLinkedQueue(); // update entry - history.add(new Event(counter, eventPayload)); - counter++; - eventCounter.put(eventName, Integer.valueOf(counter)); + history.add(new Event(eventPayload)); // store map historyMaps.put(eventName, history); @@ -108,12 +107,10 @@ public class serverProfiling extends Thread { } public static class Event { - public int count; public Object payload; public long time; - public Event(final int count, final Object payload) { - this.count = count; + public Event(final Object payload) { this.payload = payload; this.time = System.currentTimeMillis(); }