added referrer to remote crawl url list

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4236 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 18e516317d
commit 9b0ae4b989

@ -35,6 +35,7 @@ import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork; import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacyURL;
public class urls { public class urls {
@ -60,6 +61,7 @@ public class urls {
int count = Math.min(100, post.getInt("count", 0)); int count = Math.min(100, post.getInt("count", 0));
int c = 0; int c = 0;
plasmaCrawlEntry entry; plasmaCrawlEntry entry;
yacyURL referrer;
while ((count > 0) && (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) { while ((count > 0) && (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) {
try { try {
entry = sb.crawlQueues.noticeURL.pop(stackType, false); entry = sb.crawlQueues.noticeURL.pop(stackType, false);
@ -67,11 +69,14 @@ public class urls {
break; break;
} }
if (entry == null) break; if (entry == null) break;
// find referrer, if there is one
referrer = sb.getURL(entry.referrerhash());
// place url to delegated-url db // place url to delegated-url db
sb.crawlQueues.delegatedURL.push(sb.crawlQueues.delegatedURL.newEntry(entry.url(), "client=____________")); sb.crawlQueues.delegatedURL.push(sb.crawlQueues.delegatedURL.newEntry(entry.url(), "client=____________"));
// create RSS entry // create RSS entry
prop.put("item_" + c + "_title", ""); prop.put("item_" + c + "_title", "");
prop.putHTML("item_" + c + "_link", entry.url().toNormalform(true, false)); prop.putHTML("item_" + c + "_link", entry.url().toNormalform(true, false));
prop.putHTML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false));
prop.putHTML("item_" + c + "_description", entry.name()); prop.putHTML("item_" + c + "_description", entry.name());
prop.put("item_" + c + "_author", ""); prop.put("item_" + c + "_author", "");
prop.put("item_" + c + "_pubDate", serverDate.shortSecondTime(entry.appdate())); prop.put("item_" + c + "_pubDate", serverDate.shortSecondTime(entry.appdate()));

@ -21,6 +21,7 @@
<item> <item>
<title>#[title]#</title> <title>#[title]#</title>
<link>#[link]#</link> <link>#[link]#</link>
<referrer>#[referrer]#</referrer>
<description>#[description]#</description> <description>#[description]#</description>
<author>#[author]#</author> <author>#[author]#</author>
<pubDate>#[pubDate]#</pubDate> <pubDate>#[pubDate]#</pubDate>

@ -1454,10 +1454,16 @@ public final class httpc {
public void writeX(InputStream source, OutputStream procOS, OutputStream bufferOS) { public void writeX(InputStream source, OutputStream procOS, OutputStream bufferOS) {
byte[] buffer = new byte[2048]; byte[] buffer = new byte[2048];
int l, c = 0; int l, c = 0;
lastIO = System.currentTimeMillis();
while (true) try { io: while (true) try {
l = source.read(buffer, 0, buffer.length); l = source.read(buffer, 0, buffer.length);
if (l <= 0) break; if (l < 0) break;
if (l == 0) try {
if (System.currentTimeMillis() - lastIO > 30000) break;
this.wait(300);
continue io;
} catch (InterruptedException e) {} // may happen without EOF
lastIO = System.currentTimeMillis(); lastIO = System.currentTimeMillis();
c += l; c += l;
if (procOS != null) procOS.write(buffer, 0, l); if (procOS != null) procOS.write(buffer, 0, l);
@ -1479,10 +1485,16 @@ public final class httpc {
OutputStreamWriter bufferOSWriter = (bufferOS == null) ? null : new OutputStreamWriter(bufferOS,outputCharset); OutputStreamWriter bufferOSWriter = (bufferOS == null) ? null : new OutputStreamWriter(bufferOS,outputCharset);
char[] buffer = new char[2048]; char[] buffer = new char[2048];
int l, c= 0; int l, c= 0;
lastIO = System.currentTimeMillis();
while (true) try{
io: while (true) try{
l = sourceReader.read(buffer, 0, buffer.length); l = sourceReader.read(buffer, 0, buffer.length);
if (l <= 0) break; if (l < 0) break;
if (l == 0) try {
if (System.currentTimeMillis() - lastIO > 30000) break;
this.wait(300);
continue io;
} catch (InterruptedException e) {} // may happen without EOF
lastIO = System.currentTimeMillis(); lastIO = System.currentTimeMillis();
c += l; c += l;
if (procOS != null) procOS.write(buffer, 0, l); if (procOS != null) procOS.write(buffer, 0, l);

@ -238,6 +238,21 @@ public class plasmaCrawlQueues {
return false; return false;
} }
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")");
return false;
}
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
log.logFine("remoteCrawlLoaderJob: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
return false;
}
if (sb.onlineCaution()) {
log.logFine("remoteCrawlLoaderJob: online caution, omitting processing");
return false;
}
// check if we have an entry in the provider list, otherwise fill the list // check if we have an entry in the provider list, otherwise fill the list
yacySeed seed; yacySeed seed;
if ((remoteCrawlProviderHashes.size() == 0) && if ((remoteCrawlProviderHashes.size() == 0) &&
@ -271,28 +286,32 @@ public class plasmaCrawlQueues {
if (reader == null) return true; if (reader == null) return true;
// parse the rss // parse the rss
rssReader.Item item; rssReader.Item item;
yacyURL url, referrer;
Date loaddate;
for (int i = 0; i < reader.items(); i++) { for (int i = 0; i < reader.items(); i++) {
item = reader.getItem(i); item = reader.getItem(i);
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate()); //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
// put url on remote crawl stack // put url on remote crawl stack
yacyURL url;
try { try {
url = new yacyURL(item.getLink(), null); url = new yacyURL(item.getLink(), null);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
url = null; url = null;
} }
Date loaddate; try {
referrer = new yacyURL(item.getReferrer(), null);
} catch (MalformedURLException e) {
referrer = null;
}
try { try {
loaddate = serverDate.parseShortSecondTime(item.getPubDate()); loaddate = serverDate.parseShortSecondTime(item.getPubDate());
} catch (ParseException e) { } catch (ParseException e) {
loaddate = new Date(); loaddate = new Date();
} }
yacyURL referrer = null; // referrer needed!
if (sb.acceptURL(url)) { if (sb.acceptURL(url)) {
// stack url // stack url
sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'"); sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile); String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.defaultRemoteProfile);
if (reasonString == null) { if (reasonString == null) {
// done // done
@ -328,20 +347,18 @@ public class plasmaCrawlQueues {
return false; return false;
} }
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) { if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (" + log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")");
"sbQueueSize=" + sb.sbQueue.size() + ")");
return false; return false;
} }
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) { if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (" + log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
"cacheLoader=" + this.size() + ")");
return false; return false;
} }
if (sb.onlineCaution()) { if (sb.onlineCaution()) {
log.logFine("GlobalCrawl: online caution, omitting processing"); log.logFine("GlobalCrawl: online caution, omitting processing");
return false; return false;
} }
// if crawling was paused we have to wait until we were notified to continue // if crawling was paused we have to wait until we were notified to continue
Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
synchronized(status[plasmaSwitchboard.CRAWLJOB_SYNC]) { synchronized(status[plasmaSwitchboard.CRAWLJOB_SYNC]) {

@ -1497,6 +1497,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
public yacyURL getURL(String urlhash) { public yacyURL getURL(String urlhash) {
if (urlhash == null) return null;
if (urlhash.equals(yacyURL.dummyHash)) return null; if (urlhash.equals(yacyURL.dummyHash)) return null;
yacyURL ne = crawlQueues.getURL(urlhash); yacyURL ne = crawlQueues.getURL(urlhash);
if (ne != null) return ne; if (ne != null) return ne;

@ -53,6 +53,7 @@ public class rssReader extends DefaultHandler {
"category", // "category", //
"title", // "title", //
"link", // "link", //
"referrer", //
"language", // "language", //
"description", // "description", //
"creator", // "creator", //
@ -245,6 +246,10 @@ public class rssReader extends DefaultHandler {
return (String) map.get("link"); return (String) map.get("link");
} }
/**
 * Looks up the value stored under the "referrer" key of this item's map.
 *
 * @return the stored value cast to String; presumably null when no
 *         referrer was recorded for this item (depends on the map type,
 *         which is declared outside this view - confirm)
 */
public String getReferrer() {
    final Object referrerValue = map.get("referrer");
    return (String) referrerValue;
}
public String getLanguage() { public String getLanguage() {
return (String) map.get("language"); return (String) map.get("language");
} }

Loading…
Cancel
Save