- fixed a problem with searching for quoted phrases (still not complete, but not as bad as before)

- fixed parsing of crawl-delay statements when the seconds were given as floating-point numbers
- improved performance of profiling (fewer log entries: at most one per event name per second)
- removed some debug output
- fixed wrong return type in logging
- added a log-level check in httpd so that log messages are not built when they would not be written (should be added everywhere!)
- fixed wrong word distance computation in RWI management


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5101 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 3a0e96b552
commit d3d41e2ee4

@ -117,13 +117,6 @@ public class yacysearch {
}
// collect search attributes
int maxDistance = Integer.MAX_VALUE;
if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
querystring = querystring.substring(1, querystring.length() - 1).trim();
maxDistance = 1;
}
int itemsPerPage = Math.min((authenticated) ? 1000 : 10, post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative
int offset = (post.hasValue("query") && post.hasValue("former") && !post.get("query","").equalsIgnoreCase(post.get("former",""))) ? 0 : post.getInt("startRecord", post.getInt("offset", 0));
@ -181,6 +174,7 @@ public class yacysearch {
}
if ((!block) && (post == null || post.get("cat", "href").equals("href"))) {
// check available memory and clean up if necessary
if (!serverMemory.request(8000000L, false)) {
sb.webIndex.clearCache();
@ -197,7 +191,9 @@ public class yacysearch {
query[0].remove("recent");
ranking.coeff_date = plasmaSearchRankingProfile.COEFF_MAX;
}
int maxDistance = (querystring.indexOf('"') >= 0) ? maxDistance = query.length - 1 : Integer.MAX_VALUE;
// filter out stopwords
final TreeSet<String> filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords);
if (filtered.size() > 0) {

@ -493,7 +493,7 @@ public class Balancer {
15000,
Math.max(
(crawlEntry.url().isLocal()) ? minimumLocalDelta : minimumGlobalDelta,
plasmaSwitchboard.getSwitchboard().robots.crawlDelay(crawlEntry.url()) * 1000)
plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(crawlEntry.url()))
); // prevent that that robots file can stop our indexer completely
if (delta < genericDelta) {
// force a busy waiting here

@ -208,7 +208,7 @@ public class RobotsTxt {
(Date) result[DOWNLOAD_MODDATE],
(String) result[DOWNLOAD_ETAG],
parserResult.sitemap(),
parserResult.crawlDelay());
parserResult.crawlDelayMillis());
}
}
}
@ -216,10 +216,10 @@ public class RobotsTxt {
return robotsTxt4Host;
}
public int crawlDelay(final yacyURL theURL) {
public long crawlDelayMillis(final yacyURL theURL) {
final String urlHostPort = getHostPort(theURL);
final RobotsTxt.Entry robotsEntry = getEntry(urlHostPort, true);
return robotsEntry.getCrawlDelay();
return robotsEntry.getCrawlDelayMillis();
}
private Entry addEntry(
@ -230,11 +230,11 @@ public class RobotsTxt {
final Date modDate,
final String eTag,
final String sitemap,
final int crawlDelay
final long crawlDelayMillis
) {
final Entry entry = new Entry(
hostName, allowPathList, denyPathList, loadedDate, modDate,
eTag, sitemap, crawlDelay);
eTag, sitemap, crawlDelayMillis);
addEntry(entry);
return entry;
}
@ -257,6 +257,7 @@ public class RobotsTxt {
public static final String ETAG = "etag";
public static final String SITEMAP = "sitemap";
public static final String CRAWL_DELAY = "crawlDelay";
public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
// this is a simple record structure that holds all properties of a single crawl start
HashMap<String, String> mem;
@ -301,7 +302,7 @@ public class RobotsTxt {
final Date modDate,
final String eTag,
final String sitemap,
final int crawlDelay
final long crawlDelayMillis
) {
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");
@ -314,7 +315,7 @@ public class RobotsTxt {
if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
if (eTag != null) this.mem.put(ETAG,eTag);
if (sitemap != null) this.mem.put(SITEMAP,sitemap);
if (crawlDelay != 0) this.mem.put(CRAWL_DELAY, Integer.toString(crawlDelay));
if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis));
if ((allowPathList != null)&&(allowPathList.size()>0)) {
this.allowPathList.addAll(allowPathList);
@ -382,9 +383,14 @@ public class RobotsTxt {
return null;
}
public int getCrawlDelay() {
public long getCrawlDelayMillis() {
if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
return Long.parseLong(this.mem.get(CRAWL_DELAY_MILLIS));
} catch (final NumberFormatException e) {
return 0;
}
if (this.mem.containsKey(CRAWL_DELAY)) try {
return Integer.parseInt(this.mem.get(CRAWL_DELAY));
return 1000 * Integer.parseInt(this.mem.get(CRAWL_DELAY));
} catch (final NumberFormatException e) {
return 0;
}
@ -458,19 +464,19 @@ public class RobotsTxt {
return sitemapURL;
}
public Integer getCrawlDelay(final yacyURL theURL) {
public Long getCrawlDelayMillis(final yacyURL theURL) {
if (theURL == null) throw new IllegalArgumentException();
Integer crawlDelay = null;
Long crawlDelay = null;
// generating the hostname:port string needed to do a DB lookup
final String urlHostPort = getHostPort(theURL);
final RobotsTxt.Entry robotsTxt4Host = getEntry(urlHostPort, true);
try {
crawlDelay = robotsTxt4Host.getCrawlDelay();
crawlDelay = robotsTxt4Host.getCrawlDelayMillis();
} catch (final NumberFormatException e) {/* ignore this */}
return crawlDelay;
return crawlDelay;
}
public boolean isDisallowed(final yacyURL nexturl) {

@ -65,14 +65,14 @@ public final class robotsParser {
private ArrayList<String> allowList;
private ArrayList<String> denyList;
private String sitemap;
private int crawlDelay;
private long crawlDelayMillis;
public robotsParser(final byte[] robotsTxt) {
if ((robotsTxt == null)||(robotsTxt.length == 0)) {
allowList = new ArrayList<String>(0);
denyList = new ArrayList<String>(0);
sitemap = "";
crawlDelay = 0;
crawlDelayMillis = 0;
} else {
final ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
final BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
@ -85,7 +85,7 @@ public final class robotsParser {
allowList = new ArrayList<String>(0);
denyList = new ArrayList<String>(0);
sitemap = "";
crawlDelay = 0;
crawlDelayMillis = 0;
} else {
parse(reader);
}
@ -100,7 +100,7 @@ public final class robotsParser {
int pos;
String line = null, lineUpper = null;
sitemap = null;
crawlDelay = 0;
crawlDelayMillis = 0;
boolean isRule4AllAgents = false,
isRule4YaCyAgent = false,
rule4YaCyFound = false,
@ -130,7 +130,7 @@ public final class robotsParser {
inBlock = false;
isRule4AllAgents = false;
isRule4YaCyAgent = false;
crawlDelay = 0; // each block has a separate delay
crawlDelayMillis = 0; // each block has a separate delay
}
// cutting off comments at the line end
@ -138,7 +138,7 @@ public final class robotsParser {
if (pos != -1) line = line.substring(0,pos).trim();
// replacing all tabs with spaces
line = line.replaceAll("\t"," ");
line = line.replaceAll("\t"," ").replaceAll(":"," ");
// getting out the robots name
pos = line.indexOf(" ");
@ -149,10 +149,14 @@ public final class robotsParser {
if (isRule4YaCyAgent) rule4YaCyFound = true;
}
} else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
// replacing all tabs with spaces
line = line.replaceAll("\t"," ").replaceAll(":"," ");
pos = line.indexOf(" ");
if (pos != -1) {
try {
crawlDelay = Integer.parseInt(line.substring(pos).trim());
// the crawl delay can be a float number and means number of seconds
crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
} catch (final NumberFormatException e) {
// invalid crawling delay
}
@ -171,7 +175,7 @@ public final class robotsParser {
if (line.endsWith("*")) line = line.substring(0,line.length()-1);
// replacing all tabs with spaces
line = line.replaceAll("\t"," ");
line = line.replaceAll("\t"," ").replaceAll(":"," ");
// getting the path
pos = line.indexOf(" ");
@ -210,8 +214,8 @@ public final class robotsParser {
denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents;
}
public int crawlDelay() {
return this.crawlDelay;
public long crawlDelayMillis() {
return this.crawlDelayMillis;
}
public String sitemap() {

@ -144,7 +144,6 @@ public class JakartaCommonsHttpClient {
private boolean followRedirects = true;
private boolean ignoreCookies = false;
/**
* creates a new JakartaCommonsHttpClient with given timeout using global remoteProxyConfig
*
@ -434,8 +433,8 @@ public class JakartaCommonsHttpClient {
HttpConnectionInfo.addConnection(generateConInfo(method));
// execute (send request)
serverLog.logFine("HTTPC", "executing " + method.hashCode() + " " + method.getName() + " " + method.getURI());
serverLog.logFinest("HTTPC", "->" + method.hashCode() + " request headers " +
if (serverLog.isFine("HTTPC")) serverLog.logFine("HTTPC", "executing " + method.hashCode() + " " + method.getName() + " " + method.getURI());
if (serverLog.isFinest("HTTPC")) serverLog.logFinest("HTTPC", "->" + method.hashCode() + " request headers " +
Arrays.toString(method.getRequestHeaders()));
try {
if (hostConfig == null) {
@ -448,7 +447,7 @@ public class JakartaCommonsHttpClient {
HttpConnectionInfo.removeConnection(generateConInfo(method));
throw e;
}
serverLog.logFinest("HTTPC", "<-" + method.hashCode() + " response headers " +
if (serverLog.isFinest("HTTPC")) serverLog.logFinest("HTTPC", "<-" + method.hashCode() + " response headers " +
Arrays.toString(method.getResponseHeaders()));
// return response

@ -902,7 +902,7 @@ public final class httpd implements serverHandler, Cloneable {
try {
items = upload.parseRequest(request);
} catch (FileUploadException e) {
e.printStackTrace();
//e.printStackTrace();
throw new IOException("FileUploadException " + e.getMessage());
}

@ -314,7 +314,7 @@ public class indexContainer extends kelondroRowSet {
}
private static indexContainer joinConstructiveByTest(final indexContainer small, final indexContainer large, final int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY TEST");
System.out.println("DEBUG: JOIN METHOD BY TEST, maxdistance = " + maxDistance);
assert small.rowdef.equals(large.rowdef) : "small = " + small.rowdef.toString() + "; large = " + large.rowdef.toString();
final int keylength = small.rowdef.width(0);
assert (keylength == large.rowdef.width(0));
@ -337,7 +337,7 @@ public class indexContainer extends kelondroRowSet {
}
private static indexContainer joinConstructiveByEnumeration(final indexContainer i1, final indexContainer i2, final int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY ENUMERATION");
System.out.println("DEBUG: JOIN METHOD BY ENUMERATION, maxdistance = " + maxDistance);
assert i1.rowdef.equals(i2.rowdef) : "i1 = " + i1.rowdef.toString() + "; i2 = " + i2.rowdef.toString();
final int keylength = i1.rowdef.width(0);
assert (keylength == i2.rowdef.width(0));

@ -310,7 +310,7 @@ public class indexRWIVarEntry implements indexRWIEntry, Cloneable {
// joins two entries into one entry
// combine the distance
this.worddistance = this.worddistance + ((oe instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) oe).worddistance : 0) + Math.abs(this.posintext() - oe.posintext());
this.worddistance = Math.abs(this.posintext() - oe.posintext());
this.posintext = Math.min(this.posintext, oe.posintext());
this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0;
this.posofphrase = Math.min(this.posofphrase, oe.posofphrase());

@ -145,7 +145,7 @@ public final class kelondroBLOBHeap implements kelondroBLOB {
lastFree = i.next();
while (i.hasNext()) {
nextFree = i.next();
System.out.println("*** DEBUG BLOB: free-seek = " + nextFree.seek + ", size = " + nextFree.size);
//System.out.println("*** DEBUG BLOB: free-seek = " + nextFree.seek + ", size = " + nextFree.size);
// check if they follow directly
if (lastFree.seek + lastFree.size + 4 == nextFree.seek) {
// merge those records

@ -227,17 +227,15 @@ public final class plasmaSearchQuery {
return kelondroMSetTools.anymatch(wordhashes, keyhashes);
}
private static String seps = "'.,:/&"; static {seps += '"';}
@SuppressWarnings("unchecked")
public static TreeSet<String>[] cleanQuery(String querystring) {
// returns two sets: a query set and an exclude set
if ((querystring == null) || (querystring.length() == 0)) return new TreeSet[]{new TreeSet<String>(kelondroNaturalOrder.naturalComparator), new TreeSet<String>(kelondroNaturalOrder.naturalComparator)};
// convert Umlaute
querystring = htmlFilterAbstractScraper.convertUmlaute(new serverCharBuffer(querystring.toCharArray())).toString();
// remove funny symbols
final String seps = "'.,:/&";
querystring = querystring.toLowerCase().trim();
querystring = htmlFilterAbstractScraper.convertUmlaute(new serverCharBuffer(querystring.toCharArray())).toString().toLowerCase().trim();
int c;
for (int i = 0; i < seps.length(); i++) {
while ((c = querystring.indexOf(seps.charAt(i))) >= 0) { querystring = querystring.substring(0, c) + (((c + 1) < querystring.length()) ? (" " + querystring.substring(c + 1)) : ""); }
@ -288,7 +286,8 @@ public final class plasmaSearchQuery {
"*" + indexWord.word2hash(this.ranking.toExternalString()) +
"*" + this.prefer +
"*" + this.urlMask +
"*" + this.constraint;
"*" + this.constraint +
"*" + this.maxDistance;
if (anonymized)
return anonymizedQueryHashes(this.queryHashes) + "-" + anonymizedQueryHashes(this.excludeHashes) + context;
else

@ -103,8 +103,8 @@ public final class serverLog {
public static void logSevere(final String appName, final String message, final Throwable thrown) {
Logger.getLogger(appName).log(Level.SEVERE,message,thrown);
}
public static void isSevere(final String appName) {
Logger.getLogger(appName).isLoggable(Level.SEVERE);
public static boolean isSevere(final String appName) {
return Logger.getLogger(appName).isLoggable(Level.SEVERE);
}
public static void logWarning(final String appName, final String message) {
@ -113,8 +113,8 @@ public final class serverLog {
public static void logWarning(final String appName, final String message, final Throwable thrown) {
Logger.getLogger(appName).log(Level.WARNING,message,thrown);
}
public static void isWarning(final String appName) {
Logger.getLogger(appName).isLoggable(Level.WARNING);
public static boolean isWarning(final String appName) {
return Logger.getLogger(appName).isLoggable(Level.WARNING);
}
public static void logConfig(final String appName, final String message) {
@ -123,8 +123,8 @@ public final class serverLog {
public static void logConfig(final String appName, final String message, final Throwable thrown) {
Logger.getLogger(appName).log(Level.CONFIG,message,thrown);
}
public static void isConfig(final String appName) {
Logger.getLogger(appName).isLoggable(Level.CONFIG);
public static boolean isConfig(final String appName) {
return Logger.getLogger(appName).isLoggable(Level.CONFIG);
}
public static void logInfo(final String appName, final String message) {
@ -133,8 +133,8 @@ public final class serverLog {
public static void logInfo(final String appName, final String message, final Throwable thrown) {
Logger.getLogger(appName).log(Level.INFO,message,thrown);
}
public static void isInfo(final String appName) {
Logger.getLogger(appName).isLoggable(Level.INFO);
public static boolean isInfo(final String appName) {
return Logger.getLogger(appName).isLoggable(Level.INFO);
}
public static void logFine(final String appName, final String message) {
@ -143,8 +143,8 @@ public final class serverLog {
public static void logFine(final String appName, final String message, final Throwable thrown) {
Logger.getLogger(appName).log(Level.FINE,message,thrown);
}
public static void isFine(final String appName) {
Logger.getLogger(appName).isLoggable(Level.FINE);
public static boolean isFine(final String appName) {
return Logger.getLogger(appName).isLoggable(Level.FINE);
}
public static void logFiner(final String appName, final String message) {
@ -153,8 +153,8 @@ public final class serverLog {
public static void logFiner(final String appName, final String message, final Throwable thrown) {
Logger.getLogger(appName).log(Level.FINER,message,thrown);
}
public static void isFiner(final String appName) {
Logger.getLogger(appName).isLoggable(Level.FINER);
public static boolean isFiner(final String appName) {
return Logger.getLogger(appName).isLoggable(Level.FINER);
}
public static void logFinest(final String appName, final String message) {
@ -163,8 +163,8 @@ public final class serverLog {
public static void logFinest(final String appName, final String message, final Throwable thrown) {
Logger.getLogger(appName).log(Level.FINEST,message,thrown);
}
public static void isFinest(final String appName) {
Logger.getLogger(appName).isLoggable(Level.FINEST);
public static boolean isFinest(final String appName) {
return Logger.getLogger(appName).isLoggable(Level.FINEST);
}
public static final void configureLogging(final File homePath, final File loggingConfigFile) throws SecurityException, FileNotFoundException, IOException {

@ -33,14 +33,8 @@ import java.util.concurrent.ConcurrentLinkedQueue;
public class serverProfiling extends Thread {
/**
* key=name of history, value=TreeMap of Long/Event
*/
private static final Map<String, ConcurrentLinkedQueue<Event>> historyMaps = new ConcurrentHashMap<String, ConcurrentLinkedQueue<Event>>();;
/**
* key=name of history, value=Integer of event counter
*/
private static final Map<String, Integer> eventCounter = new ConcurrentHashMap<String, Integer>();
private static final Map<String, ConcurrentLinkedQueue<Event>> historyMaps = new ConcurrentHashMap<String, ConcurrentLinkedQueue<Event>>(); // value=TreeMap of Long/Event
private static final Map<String, Long> eventAccess = new ConcurrentHashMap<String, Long>(); // value: last time when this was accessed
private static serverProfiling systemProfiler = null;
public static void startSystemProfiling() {
@ -73,14 +67,21 @@ public class serverProfiling extends Thread {
public static void update(final String eventName, final Object eventPayload) {
// get event history container
int counter = eventCounter.containsKey(eventName) ? (eventCounter.get(eventName)).intValue() : 0;
if (historyMaps.containsKey(eventName)) {
final ConcurrentLinkedQueue<Event> history = historyMaps.get(eventName);
Long lastAcc = eventAccess.get(eventName);
if (lastAcc == null) {
eventAccess.put(eventName, new Long(System.currentTimeMillis()));
} else {
if (System.currentTimeMillis() - lastAcc.longValue() > 1000) {
eventAccess.put(eventName, new Long(System.currentTimeMillis()));
} else {
return; // protect against too heavy load
}
}
ConcurrentLinkedQueue<Event> history = historyMaps.get(eventName);
if (history != null) {
// update entry
history.add(new Event(counter, eventPayload));
counter++;
eventCounter.put(eventName, Integer.valueOf(counter));
history.add(new Event(eventPayload));
// clean up too old entries
Event e;
@ -91,12 +92,10 @@ public class serverProfiling extends Thread {
history.poll();
}
} else {
final ConcurrentLinkedQueue<Event> history = new ConcurrentLinkedQueue<Event>();
history = new ConcurrentLinkedQueue<Event>();
// update entry
history.add(new Event(counter, eventPayload));
counter++;
eventCounter.put(eventName, Integer.valueOf(counter));
history.add(new Event(eventPayload));
// store map
historyMaps.put(eventName, history);
@ -108,12 +107,10 @@ public class serverProfiling extends Thread {
}
public static class Event {
public int count;
public Object payload;
public long time;
public Event(final int count, final Object payload) {
this.count = count;
public Event(final Object payload) {
this.payload = payload;
this.time = System.currentTimeMillis();
}

Loading…
Cancel
Save