From abf22f6e60eeb32d6ca3a40faf0805f386015e8f Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 11 Aug 2006 15:09:22 +0000 Subject: [PATCH] removed url normalform computation from htmlFilterContentScraper. This method was implemented in de.anomic.net.URL git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2377 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 2 +- htroot/IndexControl_p.java | 5 ++-- htroot/IndexCreate_p.java | 4 +-- htroot/QuickCrawlLink_p.java | 3 +-- htroot/yacy/crawlOrder.java | 6 ++--- source/de/anomic/data/robotsParser.java | 4 +-- .../htmlFilter/htmlFilterContentScraper.java | 9 +++---- .../de/anomic/index/indexEntryAttribute.java | 3 +-- source/de/anomic/index/indexURL.java | 9 +++---- source/de/anomic/kelondro/kelondroRow.java | 12 --------- source/de/anomic/net/URL.java | 26 ++++++++++++++----- .../de/anomic/plasma/plasmaCrawlWorker.java | 3 +-- source/de/anomic/plasma/plasmaHTCache.java | 3 +-- source/de/anomic/plasma/plasmaParser.java | 12 +++------ .../anomic/plasma/plasmaParserDocument.java | 7 ++--- .../de/anomic/plasma/plasmaSearchImages.java | 8 +++--- .../de/anomic/plasma/plasmaSnippetCache.java | 3 +-- .../de/anomic/plasma/plasmaSwitchboard.java | 8 +++--- .../anomic/plasma/plasmaSwitchboardQueue.java | 3 +-- 19 files changed, 60 insertions(+), 70 deletions(-) diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index c6f051e11..77bed6ae1 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -97,7 +97,7 @@ public class CacheAdmin_p { prop.put("info", 0); path.append((pathString.length() == 0) ? linkPathString("/", true) : linkPathString(pathString, false)); - urlstr = htmlFilterContentScraper.urlNormalform(url); + urlstr = url.toNormalform(); prop.put("info_url", urlstr); info.ensureCapacity(40000); diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 63013c174..574c0565f 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -56,7 +56,6 @@ import java.util.Iterator; import java.util.Set; import java.util.TreeMap; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; @@ -214,7 +213,7 @@ public class IndexControl_p { try { plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null); URL url = entry.url(); - urlstring = htmlFilterContentScraper.urlNormalform(url); + urlstring = url.toNormalform(); prop.put("urlstring", ""); switchboard.urlPool.loadedURL.remove(urlhash); prop.put("result", "Removed URL " + urlstring); @@ -393,7 +392,7 @@ public class IndexControl_p { } if (url == null) { return "No entry found for URL-hash " + urlhash; } String result = "" + - "" + + "" + "" + "" + "" + diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index e5f9e968b..d26024631 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -141,7 +141,7 @@ public class IndexCreate_p { if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart; // normalizing URL - crawlingStart = htmlFilterContentScraper.urlNormalform(null, crawlingStart); + try {crawlingStart = new URL(crawlingStart).toNormalform();} catch (MalformedURLException e1) {} // check if url is proper URL crawlingStartURL = null; @@ -243,7 +243,7 @@ public class IndexCreate_p { nexturlstring = nexturlstring.trim(); // normalizing URL - nexturlstring = htmlFilterContentScraper.urlNormalform(null, nexturlstring); + nexturlstring = new URL(nexturlstring).toNormalform(); // generating an url object URL nexturlURL = null; diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 93753be0c..0ce745935 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -54,7 +54,6 @@ import de.anomic.net.URL; import java.net.URLDecoder; import java.util.Date; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; import de.anomic.index.indexURL; import de.anomic.plasma.plasmaCrawlProfile; @@ -137,7 +136,7 @@ public class QuickCrawlLink_p { if (crawlingStart != null) { crawlingStart = crawlingStart.trim(); - crawlingStart = htmlFilterContentScraper.urlNormalform(null, crawlingStart); + try {crawlingStart = new URL(crawlingStart).toNormalform();} catch (MalformedURLException e1) {} // check if url is proper URL crawlingStartURL = null; diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java index 7cad3b892..9af1f6a97 100644 --- a/htroot/yacy/crawlOrder.java +++ b/htroot/yacy/crawlOrder.java @@ -48,9 +48,9 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Date; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; import de.anomic.index.indexURL; +import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -173,11 +173,11 @@ public final class crawlOrder { // old method: only one url // normalizing URL - String newURL = htmlFilterContentScraper.urlNormalform(null, (String)urlv.get(0)); + String newURL = new URL((String) urlv.get(0)).toNormalform(); if (!newURL.equals(urlv.get(0))) { env.getLog().logWarning("crawlOrder: Received not normalized URL " + urlv.get(0)); } - String refURL = htmlFilterContentScraper.urlNormalform(null, (String) refv.get(0)); + String refURL = new URL((String) refv.get(0)).toNormalform(); if ((refURL != null) && (!refURL.equals(refv.get(0)))) { env.getLog().logWarning("crawlOrder: Received not normalized Referer URL " + refv.get(0) + " of URL " + urlv.get(0)); } diff --git a/source/de/anomic/data/robotsParser.java b/source/de/anomic/data/robotsParser.java index a84f03ce2..6b210e230 100644 --- a/source/de/anomic/data/robotsParser.java +++ b/source/de/anomic/data/robotsParser.java @@ -282,7 +282,7 @@ public final class robotsParser{ } } - if (robotsTxt4Host.isDisallowed(nexturl.getFile())) { + if (robotsTxt4Host.isDisallowed(nexturl.getPath())) { return true; } return false; @@ -327,7 +327,7 @@ public final class robotsParser{ } - httpc.response res = con.GET(robotsURL.getFile(), reqHeaders); + httpc.response res = con.GET(robotsURL.getPath(), reqHeaders); if (res.status.startsWith("2")) { if (!res.responseHeader.mime().startsWith("text/plain")) { robotsTxt = null; diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 418b3e0d2..db116699e 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -43,7 +43,6 @@ package de.anomic.htmlFilter; -import de.anomic.server.logging.serverLog; import de.anomic.server.serverByteBuffer; import de.anomic.net.URL; @@ -55,8 +54,6 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.TreeSet; public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { @@ -117,7 +114,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32); content.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32); } - +/* public static String urlNormalform(URL url) { boolean defaultPort = false; // serverLog.logFinest("htmlFilter", "urlNormalform: '" + url.toString() + "'"); @@ -154,7 +151,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen return null; } } - + */ public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"'; public static String[] urlComps(String normalizedURL) { return normalizedURL.toLowerCase().split(splitrex); // word components of the url @@ -162,7 +159,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen private String absolutePath(String relativePath) { try { - return urlNormalform(new URL(root, relativePath)); + return new URL(root, relativePath).toString(); } catch (Exception e) { return ""; } diff --git a/source/de/anomic/index/indexEntryAttribute.java b/source/de/anomic/index/indexEntryAttribute.java index 6eaaa6651..e5e45ede6 100644 --- a/source/de/anomic/index/indexEntryAttribute.java +++ b/source/de/anomic/index/indexEntryAttribute.java @@ -30,7 +30,6 @@ package de.anomic.index; import de.anomic.net.URL; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.server.serverCodings; import de.anomic.yacy.yacySeedDB; @@ -93,7 +92,7 @@ public class indexEntryAttribute { // doctype calculation public static char docType(URL url) { - String path = htmlFilterContentScraper.urlNormalform(url); + String path = url.getPath(); // serverLog.logFinest("PLASMA", "docType URL=" + path); char doctype = doctype = indexEntryAttribute.DT_UNKNOWN; if (path.endsWith(".gif")) { doctype = indexEntryAttribute.DT_IMAGE; } diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java index 85fc7c1d7..8dc692c19 100644 --- a/source/de/anomic/index/indexURL.java +++ b/source/de/anomic/index/indexURL.java @@ -32,7 +32,6 @@ import java.net.MalformedURLException; import java.text.SimpleDateFormat; import java.util.HashMap; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroTree; import de.anomic.server.serverCodings; @@ -501,7 +500,7 @@ public class indexURL { int domlengthKey = (l <= 8) ? 0 : (l <= 12) ? 1 : (l <= 16) ? 2 : 3; byte flagbyte = (byte) (((isHTTP) ? 0 : 32) | (id << 2) | domlengthKey); // form the 'local' part of the hash - String hash3 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(htmlFilterContentScraper.urlNormalform(url))).substring(0, 5); + String hash3 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, 5); char hash2 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0); // form the 'global' part of the hash String hash1 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.getProtocol() + ":" + host + ":" + port)).substring(0, 5); @@ -529,13 +528,13 @@ public class indexURL { public static final String oldurlHash(URL url) { if (url == null) return null; - String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(htmlFilterContentScraper.urlNormalform(url))).substring(0, urlHashLength); + String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, urlHashLength); return hash; } - public static final String oldurlHash(String url) { + public static final String oldurlHash(String url) throws MalformedURLException { if ((url == null) || (url.length() < 10)) return null; - String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(htmlFilterContentScraper.urlNormalform(null, url))).substring(0, urlHashLength); + String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(new URL(url).toNormalform())).substring(0, urlHashLength); return hash; } diff --git a/source/de/anomic/kelondro/kelondroRow.java b/source/de/anomic/kelondro/kelondroRow.java index a7cdbbeb6..3990e7ca0 100644 --- a/source/de/anomic/kelondro/kelondroRow.java +++ b/source/de/anomic/kelondro/kelondroRow.java @@ -50,18 +50,6 @@ public class kelondroRow { } } - /* - public kelondroRow(int[] rowi) { - this.row = new kelondroColumn[rowi.length]; - this.colstart = new int[rowi.length]; - this.objectsize = 0; - for (int i = 0; i < rowi.length; i++) { - this.row[i] = new kelondroColumn("col_" + i, kelondroColumn.celltype_undefined, kelondroColumn.encoder_none, rowi[i], ""); - this.colstart[i] = this.objectsize; - this.objectsize += this.row[i].cellwidth(); - } - } - */ public kelondroRow(String structure) { // define row with row syntax // example: diff --git a/source/de/anomic/net/URL.java b/source/de/anomic/net/URL.java index b9e8c99b0..3c06cc96f 100644 --- a/source/de/anomic/net/URL.java +++ b/source/de/anomic/net/URL.java @@ -150,9 +150,13 @@ public class URL { } public String getFile() { + return getFile(true); + } + + public String getFile(boolean includeReference) { // this is the path plus quest plus ref if (quest != null) return path + "?" + quest; - if (ref != null) return path + "#" + ref; + if ((ref != null) && (includeReference)) return path + "#" + ref; return path; } @@ -188,7 +192,15 @@ public class URL { return quest; } + public String toNormalform() { + return toString(false); + } + public String toString() { + return toString(true); + } + + public String toString(boolean includeReference) { // generates a normal form of the URL boolean defaultPort = false; if (this.protocol.equals("http")) { @@ -198,7 +210,7 @@ public class URL { } else if (this.protocol.equals("https")) { if (this.port < 0 || this.port == 443) { defaultPort = true; } } - String path = this.getFile(); + String path = this.getFile(includeReference); if (path.length() == 0 || path.charAt(0) != '/') { path = "/" + path; } @@ -208,8 +220,9 @@ public class URL { path = matcher.replaceAll(""); matcher.reset(path); } - - return this.protocol + "://" + this.getHost().toLowerCase() + ((defaultPort) ? "" : (":" + this.port)) + getFile(); + + if (defaultPort) { return this.protocol + "://" + this.getHost().toLowerCase() + path; } + return this.protocol + "://" + this.getHost().toLowerCase() + ((defaultPort) ? "" : (":" + this.port)) + path; } public boolean equals(URL other) { @@ -233,7 +246,8 @@ public class URL { public static void main(String[] args) { URL u; - try {u = new URL("http://www.anomic.de/home/test?x=1#home"); System.out.println(u.toString());} catch (MalformedURLException e) {} - + try {u = new URL("http://www.anomic.de/home/test?x=1#home"); System.out.println("toString=" + u.toString() + "\ntoNormalform=" + u.toNormalform());} catch (MalformedURLException e) {} + try {u = new URL("http://www.anomic.de/home/test?x=1"); System.out.println("toString=" + u.toString() + "\ntoNormalform=" + u.toNormalform());} catch (MalformedURLException e) {} + try {u = new URL("http://www.anomic.de/home/test#home"); System.out.println("toString=" + u.toString() + "\ntoNormalform=" + u.toNormalform());} catch (MalformedURLException e) {} } } diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index 86b469cfe..89c9833dd 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -53,7 +53,6 @@ import java.net.SocketException; import de.anomic.net.URL; import java.net.UnknownHostException; import java.util.Date; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpc; @@ -427,7 +426,7 @@ public final class plasmaCrawlWorker extends Thread { } // normalizing URL - redirectionUrlString = htmlFilterContentScraper.urlNormalform(url, redirectionUrlString); + redirectionUrlString = new URL(url, redirectionUrlString).toNormalform(); // generating the new URL object URL redirectionUrl = new URL(redirectionUrlString); diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 3e2a69df1..237defdd5 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -53,7 +53,6 @@ package de.anomic.plasma; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpc; import de.anomic.http.httpHeader; import de.anomic.index.indexEntryAttribute; @@ -727,7 +726,7 @@ public final class plasmaHTCache { // normalize url // serverLog.logFine("PLASMA", "Entry: URL=" + url.toString()); - this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url); + this.nomalizedURLString = url.toNormalform(); try { this.url = new URL(this.nomalizedURLString); diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 661bb69b6..bd085f096 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -298,16 +298,10 @@ public final class plasmaParser { public static String getFileExt(URL url) { // getting the file path - String name = url.getFile(); - - // chopping http parameters from the url - int p = name.lastIndexOf('?'); - if (p != -1) { - name = name.substring(0,p); - } + String name = url.getPath(); // tetermining last position of / in the file path - p = name.lastIndexOf('/'); + int p = name.lastIndexOf('/'); if (p != -1) { name = name.substring(p); } @@ -574,7 +568,7 @@ public final class plasmaParser { String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length]; int p = 0; for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j]; - plasmaParserDocument ppd = new plasmaParserDocument(new URL(htmlFilterContentScraper.urlNormalform(location)), + plasmaParserDocument ppd = new plasmaParserDocument(new URL(location.toNormalform()), mimeType, null, null, scraper.getTitle(), sections, null, scraper.getText(), scraper.getAnchors(), scraper.getImages()); diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 8b1e0e3f6..b3e3feb95 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -42,7 +42,6 @@ package de.anomic.plasma; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; import java.io.ByteArrayInputStream; @@ -192,7 +191,9 @@ public class plasmaParserDocument { } else { ext = url.substring(extpos).toLowerCase(); } - normal = htmlFilterContentScraper.urlNormalform(null, url); + try {normal = new URL(url).toNormalform();} catch (MalformedURLException e1) { + normal = null; + } if (normal != null) { //TODO: extension function is not correct if (plasmaParser.mediaExtContains(ext.substring(1))) { // this is not a normal anchor, its a media link @@ -216,7 +217,7 @@ public class plasmaParserDocument { htmlFilterImageEntry iEntry; while (i.hasNext()) { iEntry = (htmlFilterImageEntry) i.next(); - normal = htmlFilterContentScraper.urlNormalform(iEntry.url()); + normal = iEntry.url().toNormalform(); if (normal != null) medialinks.put(normal, iEntry.alt()); // avoid NullPointerException } diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java index 8b437e0bb..3782ff752 100644 --- a/source/de/anomic/plasma/plasmaSearchImages.java +++ b/source/de/anomic/plasma/plasmaSearchImages.java @@ -47,7 +47,6 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeSet; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.server.serverDate; @@ -72,10 +71,13 @@ public final class plasmaSearchImages { Iterator i = hl.entrySet().iterator(); while (i.hasNext()) { Map.Entry e = (Map.Entry) i.next(); - String nexturlstring = htmlFilterContentScraper.urlNormalform(null, (String) e.getKey()); + String nexturlstring; try { + nexturlstring = new URL((String) e.getKey()).toNormalform(); addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), new URL(nexturlstring), depth - 1)); - } catch (MalformedURLException e2) {} + } catch (MalformedURLException e1) { + e1.printStackTrace(); + } } } } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 4bda60f06..f354db86b 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -54,7 +54,6 @@ import de.anomic.http.httpHeader; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySearch; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; @@ -438,7 +437,7 @@ public class plasmaSnippetCache { while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) { urlentry = acc.nextElement(); if (urlentry.url().getHost().endsWith(".yacyh")) continue; - urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url()); + urlstring = urlentry.url().toNormalform(); if ((urlstring.matches(urlmask)) && (!(existsInCache(urlentry.url(), queryhashes)))) { new Fetcher(urlentry.url(), queryhashes).start(); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 9a521e208..d477426ca 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -107,6 +107,8 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.net.InetAddress; +import java.net.MalformedURLException; + import de.anomic.net.URL; import java.net.URLEncoder; import java.text.SimpleDateFormat; @@ -1402,7 +1404,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser while (i.hasNext()) { e = (Map.Entry) i.next(); nexturlstring = (String) e.getKey(); - nexturlstring = htmlFilterContentScraper.urlNormalform(null, nexturlstring); + try {nexturlstring = new URL(nexturlstring).toNormalform();} catch (MalformedURLException e1) {} sbStackCrawlThread.enqueue(nexturlstring, entry.url().toString(), initiatorHash, (String) e.getValue(), docDate, entry.depth() + 1, entry.profile()); @@ -1883,9 +1885,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser url = new URL("http://" + address + "/" + host.substring(0, p) + filename); urlname = "http://share." + seed.getName() + ".yacy" + filename; if ((p = urlname.indexOf("?")) > 0) urlname = urlname.substring(0, p); - urlstring = htmlFilterContentScraper.urlNormalform(url); + urlstring = url.toNormalform(); } else { - urlstring = htmlFilterContentScraper.urlNormalform(url); + urlstring = url.toNormalform(); urlname = urlstring; } descr = urlentry.descr(); diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index bf9ef13b8..984caa675 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -44,7 +44,6 @@ package de.anomic.plasma; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroBase64Order; @@ -276,7 +275,7 @@ public class plasmaSwitchboardQueue { } public String normalizedURLString() { - return htmlFilterContentScraper.urlNormalform(url); + return url.toNormalform(); } public String urlHash() {
URL String" + htmlFilterContentScraper.urlNormalform(url) + "
URL String" + url.toNormalform() + "
Hash" + urlhash + "
Description" + entry.descr() + "
Modified-Date" + entry.moddate() + "