From c7a614830af9d6cca94a0e7ec9de0b2315602c4a Mon Sep 17 00:00:00 2001
From: orbiter
Date: Fri, 15 Jun 2007 17:45:49 +0000
Subject: [PATCH] several bugfixes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3899 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/AccessTracker_p.java                     | 12 +++++++++---
 htroot/CrawlStartExpert_p.html                  |  2 +-
 htroot/CrawlStartExpert_p.java                  |  2 +-
 htroot/CrawlStartSimple_p.html                  |  2 +-
 htroot/IndexControl_p.java                      | 16 +++++++++++++---
 source/de/anomic/plasma/plasmaCrawlProfile.java |  4 ++--
 source/de/anomic/plasma/plasmaCrawlStacker.java |  4 ++--
 source/de/anomic/plasma/plasmaSwitchboard.java  |  7 +++++++
 source/de/anomic/server/serverMemory.java       | 10 ++++------
 9 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/htroot/AccessTracker_p.java b/htroot/AccessTracker_p.java
index 7a6874269..169c2f0af 100644
--- a/htroot/AccessTracker_p.java
+++ b/htroot/AccessTracker_p.java
@@ -72,7 +72,7 @@ public class AccessTracker_p {
                 prop.put("page_list_" + entCount + "_countHour", access.tailMap(new Long(System.currentTimeMillis() - 1000 * 60 * 60)).size());
                 entCount++;
             }
-        } catch (ConcurrentModificationException e) {} // we dont want to serialize this
+        } catch (ConcurrentModificationException e) {} // we don't want to synchronize this
         prop.put("page_list", entCount);
         prop.put("page_num", entCount);
     }
@@ -84,6 +84,7 @@ public class AccessTracker_p {
             if (host.length() > 0) {
                 access = switchboard.accessTrack(host);
                 if (access != null) {
+                    try {
                     Iterator ii = access.entrySet().iterator();
                     while (ii.hasNext()) {
                         entry = (Map.Entry) ii.next();
@@ -91,13 +92,15 @@ public class AccessTracker_p {
                         prop.put("page_list_" + entCount + "_date", yacyCore.universalDateShortString(new Date(((Long) entry.getKey()).longValue())));
                         prop.put("page_list_" + entCount + "_path", (String) entry.getValue());
                         entCount++;
-                    }
+                    }} catch (ConcurrentModificationException e) {} // we don't want to synchronize this
+                }
             } else {
                 Iterator i = switchboard.accessHosts();
                 while ((entCount < maxCount) && (i.hasNext())) {
                     host = (String) i.next();
                     access = switchboard.accessTrack(host);
+                    try {
                     Iterator ii = access.entrySet().iterator();
                     while (ii.hasNext()) {
                         entry = (Map.Entry) ii.next();
@@ -105,7 +108,8 @@ public class AccessTracker_p {
                         prop.put("page_list_" + entCount + "_date", yacyCore.universalDateShortString(new Date(((Long) entry.getKey()).longValue())));
                         prop.put("page_list_" + entCount + "_path", (String) entry.getValue());
                         entCount++;
-                    }
+                    }} catch (ConcurrentModificationException e) {} // we don't want to synchronize this
+                }
             }
             prop.put("page_list", entCount);
@@ -149,6 +153,7 @@ public class AccessTracker_p {
             TreeSet handles;
             int entCount = 0;
             Map.Entry entry;
+            try {
             while ((entCount < maxCount) && (i.hasNext())) {
                 entry = (Map.Entry) i.next();
                 host = (String) entry.getKey();
@@ -177,6 +182,7 @@ public class AccessTracker_p {
                 // next
                 entCount++;
             }
+            } catch (ConcurrentModificationException e) {} // we don't want to synchronize this
             prop.put("page_list", entCount);
             prop.put("page_num", entCount);
             prop.put("page_total", (page == 3) ? switchboard.localSearches.size() : switchboard.remoteSearches.size());
diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index 60e0fa073..f5ca9628b 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -16,7 +16,7 @@
 You can define URLs as start points for Web page crawling and start crawling here.
"Crawling" means that YaCy will download the given website, extract all links in it and then download the content behind these links. This is repeated as long as specified under "Crawling Depth".

-
+
diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java
index dc825b869..cfa5cb92b 100644
--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@@ -48,7 +48,7 @@ public class CrawlStartExpert_p {
         prop.put("crawlingIfOlderUnitHourCheck", 0);
         prop.put("crawlingIfOlderUnitMinuteCheck", 0);
         if ((crawlingIfOlder == -1) || (crawlingIfOlder == Integer.MAX_VALUE)) {
-            prop.put("crawlingIfOlderNumber", -1);
+            prop.put("crawlingIfOlderNumber", 1);
             prop.put("crawlingIfOlderUnitYearCheck", 1);
         } else if (crawlingIfOlder >= 60*24*365) {
             prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24*365)));
diff --git a/htroot/CrawlStartSimple_p.html b/htroot/CrawlStartSimple_p.html
index 57b85eb1e..a86c0a4de 100644
--- a/htroot/CrawlStartSimple_p.html
+++ b/htroot/CrawlStartSimple_p.html
@@ -19,7 +19,7 @@
 This is repeated as long as specified under "Crawling Depth".

-
+
diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index 4aa95927a..e2e8ca31c 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -137,7 +137,7 @@ public class IndexControl_p {
             }
             if (delurl || delurlref) {
                 for (int i = 0; i < urlx.length; i++) {
-                    switchboard.wordIndex.loadedURL.remove(urlx[i]);
+                    switchboard.urlRemove(urlx[i]);
                 }
             }
             switchboard.wordIndex.deleteContainer(keyhash);
@@ -157,7 +157,7 @@ public class IndexControl_p {
             }
             if (delurl || delurlref) {
                 for (int i = 0; i < urlx.length; i++) {
-                    switchboard.wordIndex.loadedURL.remove(urlx[i]);
+                    switchboard.urlRemove(urlx[i]);
                 }
             }
             Set urlHashes = new HashSet();
@@ -190,7 +190,17 @@ public class IndexControl_p {
             } else {
                 urlstring = entry.comp().url().toNormalform();
                 prop.put("urlstring", "");
-                switchboard.wordIndex.loadedURL.remove(urlhash);
+                switchboard.urlRemove(urlhash);
+                prop.put("result", "Removed URL " + urlstring);
+            }
+        }
+
+        if (post.containsKey("urldelete")) {
+            urlhash = plasmaURL.urlHash(urlstring);
+            if ((urlhash == null) || (urlstring == null)) {
+                prop.put("result", "No input given; nothing deleted.");
+            } else {
+                switchboard.urlRemove(urlhash);
                 prop.put("result", "Removed URL " + urlstring);
             }
         }
diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java
index a5b1f08ed..089b22d00 100644
--- a/source/de/anomic/plasma/plasmaCrawlProfile.java
+++ b/source/de/anomic/plasma/plasmaCrawlProfile.java
@@ -346,8 +346,8 @@ public class plasmaCrawlProfile {
             // an entry must have to be re-crawled
             String r = (String) mem.get(RECRAWL_IF_OLDER);
             if (r == null) return Long.MAX_VALUE; else try {
-                long l = Long.parseLong(r) * ((long) 60000);
-                if (l < 0) return Long.MAX_VALUE; else return l;
+                long l = Long.parseLong(r) * 60000L;
+                return (l < 0) ? Long.MAX_VALUE : l;
             } catch (NumberFormatException e) {
                 return 0;
             }
diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java
index 79aaf4c80..8122f8fad 100644
--- a/source/de/anomic/plasma/plasmaCrawlStacker.java
+++ b/source/de/anomic/plasma/plasmaCrawlStacker.java
@@ -379,8 +379,8 @@ public final class plasmaCrawlStacker {
         String dbocc = this.sb.urlExists(nexturlhash);
         indexURLEntry oldEntry = null;
         oldEntry = this.sb.wordIndex.loadedURL.load(nexturlhash, null);
-        boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
-        // FIXME: this does not work correctly?
+        boolean recrawl = (oldEntry != null) && ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) > profile.recrawlIfOlder());
+        // apply recrawl rule
         if ((dbocc != null) && (!(recrawl))) {
             reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
             //this.log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index ab4e222de..38db5cdf8 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1423,6 +1423,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         return null;
     }

+    public void urlRemove(String hash) {
+        wordIndex.loadedURL.remove(hash);
+        noticeURL.remove(hash);
+        delegatedURL.remove(hash);
+        errorURL.remove(hash);
+    }
+
     public URL getURL(String urlhash) throws IOException {
         if (urlhash.equals(plasmaURL.dummyHash)) return null;
         plasmaCrawlEntry ne = noticeURL.get(urlhash);
diff --git a/source/de/anomic/server/serverMemory.java b/source/de/anomic/server/serverMemory.java
index 9e5a27576..a813e65b8 100644
--- a/source/de/anomic/server/serverMemory.java
+++ b/source/de/anomic/server/serverMemory.java
@@ -119,17 +119,15 @@ public class serverMemory {
      * @return whether enough memory could be freed (or is free) or not
      */
     public static boolean request(final long size, final boolean force) {
-        long avail;
+        long avail = available();
+        if (avail >= size) return true;
         if (log.isFine()) {
             String t = new Throwable("Stack trace").getStackTrace()[1].toString();
-            avail = available();
             log.logFine(t + " requested " + (size >>> 10) + " KB, got " + (avail >>> 10) + " KB");
-        } else {
-            avail = available();
-        }
-        if (avail >= size) return true;
+        }
         final long avg = getAverageGCFree();
         if (force || avg == 0 || avg + avail >= size) {
+            // this is only called if we expect that an allocation of this many bytes would cause the JVM to run the GC anyway
             final long freed = runGC(!force);
             avail = available();
             log.logInfo("performed " + ((force) ? "explicit" : "necessary") + " GC, freed " + (freed >>> 10)
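
Note on the AccessTracker_p changes: the page iterates access-tracker maps that other threads update while the page renders. Rather than synchronizing every access for a mere statistics view, the patch wraps each listing loop in try { ... } catch (ConcurrentModificationException e) {} and accepts an occasionally truncated listing. A minimal sketch of that trade-off, with hypothetical generic types (the real code uses raw TreeMap iterators):

    import java.util.ConcurrentModificationException;
    import java.util.Iterator;
    import java.util.Map;

    // Best-effort listing of a map that concurrent writers may change mid-iteration.
    class AccessListSketch {
        static int render(Map<Long, String> access, int maxCount) {
            int entCount = 0;
            try {
                Iterator<Map.Entry<Long, String>> ii = access.entrySet().iterator();
                while (entCount < maxCount && ii.hasNext()) {
                    Map.Entry<Long, String> entry = ii.next();
                    System.out.println(entry.getKey() + " " + entry.getValue());
                    entCount++;
                }
            } catch (ConcurrentModificationException e) {
                // a writer changed the map mid-iteration; keep the partial output
            }
            return entCount;
        }
    }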
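
Note on the recrawl fix in plasmaCrawlStacker.java: plasmaCrawlProfile.recrawlIfOlder() returns a threshold in milliseconds (the profile stores minutes, multiplied by 60000L), but the stacker previously divided the entry's age by 60000 before comparing, so it compared minutes against milliseconds and re-crawling practically never triggered. The patch compares milliseconds on both sides. A minimal, self-contained sketch of the corrected decision; the method names mirror the patch, but the class is hypothetical, not YaCy code:

    public class RecrawlRule {

        // Profile setting: minutes as a string, converted to a millisecond
        // threshold; Long.MAX_VALUE means "never re-crawl", 0 means "always".
        static long recrawlIfOlder(String minutes) {
            if (minutes == null) return Long.MAX_VALUE;
            try {
                long l = Long.parseLong(minutes) * 60000L; // minutes -> milliseconds
                return (l < 0) ? Long.MAX_VALUE : l;       // negative or overflowed: never
            } catch (NumberFormatException e) {
                return 0;                                  // unparseable: always re-crawl
            }
        }

        // Corrected check: age and threshold are both milliseconds. The old
        // code divided the age by 60000 here, comparing minutes to milliseconds.
        static boolean shouldRecrawl(long loadDateMillis, String profileMinutes) {
            long ageMillis = System.currentTimeMillis() - loadDateMillis;
            return ageMillis > recrawlIfOlder(profileMinutes);
        }
    }

For example, with a profile value of "60" (one hour), an entry loaded 90 minutes ago yields 5400000 > 3600000 and is re-crawled; under the old comparison, 90 > 3600000 was false.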
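
Note on the new plasmaSwitchboard.urlRemove(): IndexControl_p previously deleted a URL hash only from wordIndex.loadedURL, which could leave stale entries for the same hash in the notice, delegated, and error queues. The helper gives every caller one entry point that clears all four stores. A sketch of the pattern; the UrlStore interface and class names are invented for illustration, only the four store names come from the patch:

    interface UrlStore {
        void remove(String hash); // delete the entry for this URL hash, if present
    }

    class SwitchboardSketch {
        private final UrlStore[] stores; // loadedURL, noticeURL, delegatedURL, errorURL

        SwitchboardSketch(UrlStore loadedURL, UrlStore noticeURL,
                          UrlStore delegatedURL, UrlStore errorURL) {
            this.stores = new UrlStore[] { loadedURL, noticeURL, delegatedURL, errorURL };
        }

        // Single entry point: a caller such as IndexControl_p cannot forget
        // one of the queues, which is the inconsistency the patch fixes.
        public void urlRemove(String hash) {
            for (UrlStore s : stores) s.remove(hash);
        }
    }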
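
Note on serverMemory.request(): the old code computed available() on both branches and, with fine logging enabled, built a Throwable for the log line even when plenty of memory was free. The patch checks availability first and returns early, so the comparatively expensive stack-trace capture only happens on the slow path. A condensed sketch of the pattern; the memory and logging calls here are stand-ins, not the YaCy API:

    // Hypothetical stand-alone version of the fast-path/slow-path split.
    class MemoryRequestSketch {
        static boolean request(long size, boolean force) {
            long avail = Runtime.getRuntime().freeMemory(); // stand-in for available()
            if (avail >= size) return true; // fast path: no Throwable, no logging
            // slow path only: capturing a stack trace element is relatively costly
            String caller = new Throwable("Stack trace").getStackTrace()[1].toString();
            System.err.println(caller + " requested " + (size >>> 10) + " KB, got " + (avail >>> 10) + " KB");
            // a real implementation would now run a GC and re-check, as in the patch
            return false;
        }
    }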