From 9b0ae4b9899129bed74117499904d33af66dc758 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 29 Nov 2007 13:58:00 +0000 Subject: [PATCH] added referrer to remote crawl url list git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4236 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacy/urls.java | 5 +++ htroot/yacy/urls.xml | 1 + source/de/anomic/http/httpc.java | 22 +++++++++--- .../plasma/crawler/plasmaCrawlQueues.java | 35 ++++++++++++++----- .../de/anomic/plasma/plasmaSwitchboard.java | 1 + source/de/anomic/xml/rssReader.java | 5 +++ 6 files changed, 55 insertions(+), 14 deletions(-) diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index 4fcbea6f7..cb50d7c98 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -35,6 +35,7 @@ import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyNetwork; +import de.anomic.yacy.yacyURL; public class urls { @@ -60,6 +61,7 @@ public class urls { int count = Math.min(100, post.getInt("count", 0)); int c = 0; plasmaCrawlEntry entry; + yacyURL referrer; while ((count > 0) && (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) { try { entry = sb.crawlQueues.noticeURL.pop(stackType, false); @@ -67,11 +69,14 @@ public class urls { break; } if (entry == null) break; + // find referrer, if there is one + referrer = sb.getURL(entry.referrerhash()); // place url to notice-url db sb.crawlQueues.delegatedURL.push(sb.crawlQueues.delegatedURL.newEntry(entry.url(), "client=____________")); // create RSS entry prop.put("item_" + c + "_title", ""); prop.putHTML("item_" + c + "_link", entry.url().toNormalform(true, false)); + prop.putHTML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false)); prop.putHTML("item_" + c + "_description", entry.name()); prop.put("item_" + c + "_author", ""); prop.put("item_" + c + "_pubDate", serverDate.shortSecondTime(entry.appdate())); diff --git a/htroot/yacy/urls.xml b/htroot/yacy/urls.xml index e4adbf389..e67adb907 100644 --- a/htroot/yacy/urls.xml +++ b/htroot/yacy/urls.xml @@ -21,6 +21,7 @@ #[title]# #[link]# +#[referrer]# #[description]# #[author]# #[pubDate]# diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java index 04fc115f1..e5bcdb9d3 100644 --- a/source/de/anomic/http/httpc.java +++ b/source/de/anomic/http/httpc.java @@ -1454,10 +1454,16 @@ public final class httpc { public void writeX(InputStream source, OutputStream procOS, OutputStream bufferOS) { byte[] buffer = new byte[2048]; int l, c = 0; + lastIO = System.currentTimeMillis(); - while (true) try { + io: while (true) try { l = source.read(buffer, 0, buffer.length); - if (l <= 0) break; + if (l < 0) break; + if (l == 0) try { + if (System.currentTimeMillis() - lastIO > 30000) break; + this.wait(300); + continue io; + } catch (InterruptedException e) {} // may happen without EOF lastIO = System.currentTimeMillis(); c += l; if (procOS != null) procOS.write(buffer, 0, l); @@ -1479,10 +1485,16 @@ public final class httpc { OutputStreamWriter bufferOSWriter = (bufferOS == null) ? null : new OutputStreamWriter(bufferOS,outputCharset); char[] buffer = new char[2048]; int l, c= 0; - - while (true) try{ + lastIO = System.currentTimeMillis(); + + io: while (true) try{ l = sourceReader.read(buffer, 0, buffer.length); - if (l <= 0) break; + if (l < 0) break; + if (l == 0) try { + if (System.currentTimeMillis() - lastIO > 30000) break; + this.wait(300); + continue io; + } catch (InterruptedException e) {} // may happen without EOF lastIO = System.currentTimeMillis(); c += l; if (procOS != null) procOS.write(buffer, 0, l); diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java index 3a8edfcba..e30a00a90 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlQueues.java @@ -238,6 +238,21 @@ public class plasmaCrawlQueues { return false; } + if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) { + log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")"); + return false; + } + + if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) { + log.logFine("remoteCrawlLoaderJob: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")"); + return false; + } + + if (sb.onlineCaution()) { + log.logFine("remoteCrawlLoaderJob: online caution, omitting processing"); + return false; + } + // check if we have an entry in the provider list, otherwise fill the list yacySeed seed; if ((remoteCrawlProviderHashes.size() == 0) && @@ -271,28 +286,32 @@ public class plasmaCrawlQueues { if (reader == null) return true; // parse the rss rssReader.Item item; + yacyURL url, referrer; + Date loaddate; for (int i = 0; i < reader.items(); i++) { item = reader.getItem(i); //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate()); // put url on remote crawl stack - yacyURL url; try { url = new yacyURL(item.getLink(), null); } catch (MalformedURLException e) { url = null; } - Date loaddate; + try { + referrer = new yacyURL(item.getReferrer(), null); + } catch (MalformedURLException e) { + referrer = null; + } try { loaddate = serverDate.parseShortSecondTime(item.getPubDate()); } catch (ParseException e) { loaddate = new Date(); } - yacyURL referrer = null; // referrer needed! if (sb.acceptURL(url)) { // stack url sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'"); - String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile); + String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.defaultRemoteProfile); if (reasonString == null) { // done @@ -328,20 +347,18 @@ public class plasmaCrawlQueues { return false; } if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) { - log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (" + - "sbQueueSize=" + sb.sbQueue.size() + ")"); + log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")"); return false; } if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) { - log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (" + - "cacheLoader=" + this.size() + ")"); + log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")"); return false; } if (sb.onlineCaution()) { log.logFine("GlobalCrawl: online caution, omitting processing"); return false; } - + // if crawling was paused we have to wait until we wer notified to continue Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); synchronized(status[plasmaSwitchboard.CRAWLJOB_SYNC]) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index be762399d..695929f38 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1497,6 +1497,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public yacyURL getURL(String urlhash) { + if (urlhash == null) return null; if (urlhash.equals(yacyURL.dummyHash)) return null; yacyURL ne = crawlQueues.getURL(urlhash); if (ne != null) return ne; diff --git a/source/de/anomic/xml/rssReader.java b/source/de/anomic/xml/rssReader.java index 4616ec200..3c3db8d66 100644 --- a/source/de/anomic/xml/rssReader.java +++ b/source/de/anomic/xml/rssReader.java @@ -53,6 +53,7 @@ public class rssReader extends DefaultHandler { "category", // "title", // "link", // + "referrer", // "language", // "description", // "creator", // @@ -245,6 +246,10 @@ public class rssReader extends DefaultHandler { return (String) map.get("link"); } + public String getReferrer() { + return (String) map.get("referrer"); + } + public String getLanguage() { return (String) map.get("language"); }