From 021840e5bacf3602d13d5e695759a4444fa99930 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 17 May 2011 00:00:01 +0000 Subject: [PATCH] removed (almost) deadlocks and unnecessary CPU load git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7726 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/search/Switchboard.java | 4 ++-- source/de/anomic/yacy/graphics/WebStructureGraph.java | 3 +-- source/net/yacy/document/parser/html/ContentScraper.java | 5 ++++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 9156b82da..b48a2e6a3 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1989,7 +1989,7 @@ public final class Switchboard extends serverSwitch { assert in.queueEntry != null; assert in.documents != null; assert in.queueEntry != null; - webStructure.generateCitationReference(in.queueEntry.url(), in.documents[i], (in.condenser == null) ? null : in.condenser[i], in.queueEntry.lastModified()); // [outlinksSame, outlinksOther] + webStructure.generateCitationReference(in.queueEntry.url(), in.documents[i], (in.condenser == null) ? null : in.condenser[i]); // [outlinksSame, outlinksOther] } return in; } @@ -2174,7 +2174,7 @@ public final class Switchboard extends serverSwitch { } final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib); ResultImages.registerImages(url, document, true); - webStructure.generateCitationReference(url, document, condenser, response.lastModified()); + webStructure.generateCitationReference(url, document, condenser); storeDocumentIndex(process, response, document, condenser, searchEvent, "heuristic:" + heuristicName); log.logInfo("addToIndex fill of url " + url.toNormalform(true, true) + " finished"); } diff --git a/source/de/anomic/yacy/graphics/WebStructureGraph.java b/source/de/anomic/yacy/graphics/WebStructureGraph.java index 216144094..21f456973 100644 --- a/source/de/anomic/yacy/graphics/WebStructureGraph.java +++ b/source/de/anomic/yacy/graphics/WebStructureGraph.java @@ -32,7 +32,6 @@ import java.io.IOException; import java.text.ParseException; import java.util.ArrayList; import java.util.Collection; -import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -139,7 +138,7 @@ public class WebStructureGraph { } } - public void generateCitationReference(final DigestURI url, final Document document, final Condenser condenser, final Date docDate) { + public void generateCitationReference(final DigestURI url, final Document document, final Condenser condenser) { // generate citation reference final Map hl = document.getHyperlinks(); final Iterator it = hl.keySet().iterator(); diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 602f1c535..2815a62b9 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -243,12 +243,15 @@ public class ContentScraper extends AbstractScraper implements Scraper { String u; MultiProtocolURI url; while (s < b.length()) { + p = find(b, "://", s); + if (p == Integer.MAX_VALUE) break; + s = Math.max(0, p - 5); p = Math.min(find(b, "smb://", s), Math.min(find(b, "ftp://", s), Math.min(find(b, "http://", s), find(b, "https://", s)))); if (p == Integer.MAX_VALUE) break; q = b.indexOf(" ", p + 1); u = b.substring(p, q < 0 ? b.length() : q); if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above - s = p + 1; + s = p + 6; try { url = new MultiProtocolURI(u); anchors.put(url, new Properties());