From 98abe0804d05d75ef052f3e6eaa9d1d959939a60 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 2 Oct 2007 20:30:42 +0000 Subject: [PATCH] another enhancement to crawl starts with link files git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4123 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/WatchCrawler_p.java | 21 +++++++--------- .../de/anomic/plasma/plasmaCrawlStacker.java | 24 ++++++++++++------- .../de/anomic/plasma/plasmaSwitchboard.java | 2 +- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index 99a312b89..393751991 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -276,7 +276,6 @@ public class WatchCrawler_p { // loop through the contained links Iterator linkiterator = hyperlinks.entrySet().iterator(); - int c = 0; while (linkiterator.hasNext()) { Map.Entry e = (Map.Entry) linkiterator.next(); String nexturlstring = (String) e.getKey(); @@ -294,21 +293,19 @@ public class WatchCrawler_p { nexturlURL = new yacyURL(nexturlstring, null); } catch (MalformedURLException ex) { nexturlURL = null; - c++; continue; } // enqueuing the url for crawling - String rejectReason = switchboard.sbStackCrawlThread.stackCrawl(nexturlstring, null, yacyCore.seedDB.mySeed().hash, (String)e.getValue(), new Date(), 0, profile); - - // if something failed add the url into the errorURL list - if (rejectReason == null) { - c++; - } else { - plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, rejectReason); - ee.store(); - switchboard.errorURL.stackPushEntry(ee); - } + switchboard.sbStackCrawlThread.enqueue( + nexturlURL, + null, + yacyCore.seedDB.mySeed().hash, + (String) e.getValue(), + new Date(), + 0, + profile, + true); } } catch (PatternSyntaxException e) { diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 1880a64c7..7c939010e 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -194,7 +194,8 @@ public final class plasmaCrawlStacker { String name, Date loadDate, int currentdepth, - plasmaCrawlProfile.entry profile) { + plasmaCrawlProfile.entry profile, + boolean first) { if (profile != null) try { this.queue.addMessage(new plasmaCrawlEntry( initiatorHash, @@ -206,7 +207,8 @@ public final class plasmaCrawlStacker { currentdepth, 0, 0 - )); + ), + first); } catch (Exception e) { e.printStackTrace(); } @@ -534,23 +536,27 @@ public final class plasmaCrawlStacker { this.urlEntryHashCache.clear(); } - public void addMessage(plasmaCrawlEntry newMessage) - throws InterruptedException, IOException { + public void addMessage(plasmaCrawlEntry newMessage, boolean first) throws InterruptedException, IOException { if (newMessage == null) throw new NullPointerException(); this.writeSync.P(); try { - boolean insertionDoneSuccessfully = false; + boolean insertionDone = false; synchronized(this.urlEntryHashCache) { kelondroRow.Entry oldValue = this.urlEntryCache.put(newMessage.toRow()); if (oldValue == null) { - insertionDoneSuccessfully = this.urlEntryHashCache.add(newMessage.url().hash()); + if (first) { + this.urlEntryHashCache.addFirst(newMessage.url().hash()); + } else { + this.urlEntryHashCache.addLast(newMessage.url().hash()); + } + insertionDone = true; } } - if (insertionDoneSuccessfully) { - this.readSync.V(); + if (insertionDone) { + this.readSync.V(); } } finally { this.writeSync.V(); @@ -560,7 +566,7 @@ public final class plasmaCrawlStacker { public int size() { synchronized(this.urlEntryHashCache) { return this.urlEntryHashCache.size(); - } + } } public int getDBType() { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index ea5d444ad..14ae7e93e 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -2469,7 +2469,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser nextUrl = new yacyURL(nextUrlString, null); // enqueue the hyperlink into the pre-notice-url db - sbStackCrawlThread.enqueue(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile()); + sbStackCrawlThread.enqueue(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile(), entry.depth() <= 1); } catch (MalformedURLException e1) {} } log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +