another enhancement to crawl starts with link files

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4123 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent ed2ca8fc4c
commit 98abe0804d

@ -276,7 +276,6 @@ public class WatchCrawler_p {
// loop through the contained links
Iterator linkiterator = hyperlinks.entrySet().iterator();
int c = 0;
while (linkiterator.hasNext()) {
Map.Entry e = (Map.Entry) linkiterator.next();
String nexturlstring = (String) e.getKey();
@ -294,21 +293,19 @@ public class WatchCrawler_p {
nexturlURL = new yacyURL(nexturlstring, null);
} catch (MalformedURLException ex) {
nexturlURL = null;
c++;
continue;
}
// enqueuing the url for crawling
String rejectReason = switchboard.sbStackCrawlThread.stackCrawl(nexturlstring, null, yacyCore.seedDB.mySeed().hash, (String)e.getValue(), new Date(), 0, profile);
// if something failed add the url into the errorURL list
if (rejectReason == null) {
c++;
} else {
plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, rejectReason);
ee.store();
switchboard.errorURL.stackPushEntry(ee);
}
switchboard.sbStackCrawlThread.enqueue(
nexturlURL,
null,
yacyCore.seedDB.mySeed().hash,
(String) e.getValue(),
new Date(),
0,
profile,
true);
}
} catch (PatternSyntaxException e) {

@ -194,7 +194,8 @@ public final class plasmaCrawlStacker {
String name,
Date loadDate,
int currentdepth,
plasmaCrawlProfile.entry profile) {
plasmaCrawlProfile.entry profile,
boolean first) {
if (profile != null) try {
this.queue.addMessage(new plasmaCrawlEntry(
initiatorHash,
@ -206,7 +207,8 @@ public final class plasmaCrawlStacker {
currentdepth,
0,
0
));
),
first);
} catch (Exception e) {
e.printStackTrace();
}
@ -534,23 +536,27 @@ public final class plasmaCrawlStacker {
this.urlEntryHashCache.clear();
}
public void addMessage(plasmaCrawlEntry newMessage)
throws InterruptedException, IOException {
public void addMessage(plasmaCrawlEntry newMessage, boolean first) throws InterruptedException, IOException {
if (newMessage == null) throw new NullPointerException();
this.writeSync.P();
try {
boolean insertionDoneSuccessfully = false;
boolean insertionDone = false;
synchronized(this.urlEntryHashCache) {
kelondroRow.Entry oldValue = this.urlEntryCache.put(newMessage.toRow());
if (oldValue == null) {
insertionDoneSuccessfully = this.urlEntryHashCache.add(newMessage.url().hash());
if (first) {
this.urlEntryHashCache.addFirst(newMessage.url().hash());
} else {
this.urlEntryHashCache.addLast(newMessage.url().hash());
}
insertionDone = true;
}
}
if (insertionDoneSuccessfully) {
this.readSync.V();
if (insertionDone) {
this.readSync.V();
}
} finally {
this.writeSync.V();
@ -560,7 +566,7 @@ public final class plasmaCrawlStacker {
// Returns the current number of entries held in urlEntryHashCache.
// The count is read inside a synchronized block on urlEntryHashCache.
public int size() {
synchronized(this.urlEntryHashCache) {
return this.urlEntryHashCache.size();
}
}
}
public int getDBType() {

@ -2469,7 +2469,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
nextUrl = new yacyURL(nextUrlString, null);
// enqueue the hyperlink into the pre-notice-url db
sbStackCrawlThread.enqueue(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
sbStackCrawlThread.enqueue(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile(), entry.depth() <= 1);
} catch (MalformedURLException e1) {}
}
log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +

Loading…
Cancel
Save