Removed the URL normal-form computation from htmlFilterContentScraper.

The method is implemented in de.anomic.net.URL instead (as URL.toNormalform()).


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2377 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 740d49751d
commit abf22f6e60

@ -97,7 +97,7 @@ public class CacheAdmin_p {
prop.put("info", 0);
path.append((pathString.length() == 0) ? linkPathString("/", true) : linkPathString(pathString, false));
urlstr = htmlFilterContentScraper.urlNormalform(url);
urlstr = url.toNormalform();
prop.put("info_url", urlstr);
info.ensureCapacity(40000);

@ -56,7 +56,6 @@ import java.util.Iterator;
import java.util.Set;
import java.util.TreeMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
@ -214,7 +213,7 @@ public class IndexControl_p {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null);
URL url = entry.url();
urlstring = htmlFilterContentScraper.urlNormalform(url);
urlstring = url.toNormalform();
prop.put("urlstring", "");
switchboard.urlPool.loadedURL.remove(urlhash);
prop.put("result", "Removed URL " + urlstring);
@ -393,7 +392,7 @@ public class IndexControl_p {
}
if (url == null) { return "No entry found for URL-hash " + urlhash; }
String result = "<table>" +
"<tr><td class=\"small\">URL String</td><td class=\"tt\">" + htmlFilterContentScraper.urlNormalform(url) + "</td></tr>" +
"<tr><td class=\"small\">URL String</td><td class=\"tt\">" + url.toNormalform() + "</td></tr>" +
"<tr><td class=\"small\">Hash</td><td class=\"tt\">" + urlhash + "</td></tr>" +
"<tr><td class=\"small\">Description</td><td class=\"tt\">" + entry.descr() + "</td></tr>" +
"<tr><td class=\"small\">Modified-Date</td><td class=\"tt\">" + entry.moddate() + "</td></tr>" +

@ -141,7 +141,7 @@ public class IndexCreate_p {
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;
// normalizing URL
crawlingStart = htmlFilterContentScraper.urlNormalform(null, crawlingStart);
try {crawlingStart = new URL(crawlingStart).toNormalform();} catch (MalformedURLException e1) {}
// check if url is proper
URL crawlingStartURL = null;
@ -243,7 +243,7 @@ public class IndexCreate_p {
nexturlstring = nexturlstring.trim();
// normalizing URL
nexturlstring = htmlFilterContentScraper.urlNormalform(null, nexturlstring);
nexturlstring = new URL(nexturlstring).toNormalform();
// generating an url object
URL nexturlURL = null;

@ -54,7 +54,6 @@ import de.anomic.net.URL;
import java.net.URLDecoder;
import java.util.Date;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlProfile;
@ -137,7 +136,7 @@ public class QuickCrawlLink_p {
if (crawlingStart != null) {
crawlingStart = crawlingStart.trim();
crawlingStart = htmlFilterContentScraper.urlNormalform(null, crawlingStart);
try {crawlingStart = new URL(crawlingStart).toNormalform();} catch (MalformedURLException e1) {}
// check if url is proper
URL crawlingStartURL = null;

@ -48,9 +48,9 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -173,11 +173,11 @@ public final class crawlOrder {
// old method: only one url
// normalizing URL
String newURL = htmlFilterContentScraper.urlNormalform(null, (String)urlv.get(0));
String newURL = new URL((String) urlv.get(0)).toNormalform();
if (!newURL.equals(urlv.get(0))) {
env.getLog().logWarning("crawlOrder: Received not normalized URL " + urlv.get(0));
}
String refURL = htmlFilterContentScraper.urlNormalform(null, (String) refv.get(0));
String refURL = new URL((String) refv.get(0)).toNormalform();
if ((refURL != null) && (!refURL.equals(refv.get(0)))) {
env.getLog().logWarning("crawlOrder: Received not normalized Referer URL " + refv.get(0) + " of URL " + urlv.get(0));
}

@ -282,7 +282,7 @@ public final class robotsParser{
}
}
if (robotsTxt4Host.isDisallowed(nexturl.getFile())) {
if (robotsTxt4Host.isDisallowed(nexturl.getPath())) {
return true;
}
return false;
@ -327,7 +327,7 @@ public final class robotsParser{
}
httpc.response res = con.GET(robotsURL.getFile(), reqHeaders);
httpc.response res = con.GET(robotsURL.getPath(), reqHeaders);
if (res.status.startsWith("2")) {
if (!res.responseHeader.mime().startsWith("text/plain")) {
robotsTxt = null;

@ -43,7 +43,6 @@
package de.anomic.htmlFilter;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverByteBuffer;
import de.anomic.net.URL;
@ -55,8 +54,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.TreeSet;
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
@ -117,7 +114,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);
content.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32);
}
/*
public static String urlNormalform(URL url) {
boolean defaultPort = false;
// serverLog.logFinest("htmlFilter", "urlNormalform: '" + url.toString() + "'");
@ -154,7 +151,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return null;
}
}
*/
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
public static String[] urlComps(String normalizedURL) {
return normalizedURL.toLowerCase().split(splitrex); // word components of the url
@ -162,7 +159,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
private String absolutePath(String relativePath) {
try {
return urlNormalform(new URL(root, relativePath));
return new URL(root, relativePath).toString();
} catch (Exception e) {
return "";
}

@ -30,7 +30,6 @@ package de.anomic.index;
import de.anomic.net.URL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
@ -93,7 +92,7 @@ public class indexEntryAttribute {
// doctype calculation
public static char docType(URL url) {
String path = htmlFilterContentScraper.urlNormalform(url);
String path = url.getPath();
// serverLog.logFinest("PLASMA", "docType URL=" + path);
char doctype = doctype = indexEntryAttribute.DT_UNKNOWN;
if (path.endsWith(".gif")) { doctype = indexEntryAttribute.DT_IMAGE; }

@ -32,7 +32,6 @@ import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroTree;
import de.anomic.server.serverCodings;
@ -501,7 +500,7 @@ public class indexURL {
int domlengthKey = (l <= 8) ? 0 : (l <= 12) ? 1 : (l <= 16) ? 2 : 3;
byte flagbyte = (byte) (((isHTTP) ? 0 : 32) | (id << 2) | domlengthKey);
// form the 'local' part of the hash
String hash3 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(htmlFilterContentScraper.urlNormalform(url))).substring(0, 5);
String hash3 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, 5);
char hash2 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0);
// form the 'global' part of the hash
String hash1 = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.getProtocol() + ":" + host + ":" + port)).substring(0, 5);
@ -529,13 +528,13 @@ public class indexURL {
public static final String oldurlHash(URL url) {
if (url == null) return null;
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(htmlFilterContentScraper.urlNormalform(url))).substring(0, urlHashLength);
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, urlHashLength);
return hash;
}
public static final String oldurlHash(String url) {
public static final String oldurlHash(String url) throws MalformedURLException {
if ((url == null) || (url.length() < 10)) return null;
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(htmlFilterContentScraper.urlNormalform(null, url))).substring(0, urlHashLength);
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(new URL(url).toNormalform())).substring(0, urlHashLength);
return hash;
}

@ -50,18 +50,6 @@ public class kelondroRow {
}
}
/*
public kelondroRow(int[] rowi) {
this.row = new kelondroColumn[rowi.length];
this.colstart = new int[rowi.length];
this.objectsize = 0;
for (int i = 0; i < rowi.length; i++) {
this.row[i] = new kelondroColumn("col_" + i, kelondroColumn.celltype_undefined, kelondroColumn.encoder_none, rowi[i], "");
this.colstart[i] = this.objectsize;
this.objectsize += this.row[i].cellwidth();
}
}
*/
public kelondroRow(String structure) {
// define row with row syntax
// example:

@ -150,9 +150,13 @@ public class URL {
}
// Convenience overload: returns the result of getFile(boolean) called
// with includeReference = true.
public String getFile() {
return getFile(true);
}
public String getFile(boolean includeReference) {
// this is the path plus quest plus ref
if (quest != null) return path + "?" + quest;
if (ref != null) return path + "#" + ref;
if ((ref != null) && (includeReference)) return path + "#" + ref;
return path;
}
@ -188,7 +192,15 @@ public class URL {
return quest;
}
// Returns the string form of this URL as built by toString(boolean)
// with includeReference = false.
public String toNormalform() {
return toString(false);
}
// Returns the string form of this URL as built by toString(boolean)
// with includeReference = true.
public String toString() {
return toString(true);
}
public String toString(boolean includeReference) {
// generates a normal form of the URL
boolean defaultPort = false;
if (this.protocol.equals("http")) {
@ -198,7 +210,7 @@ public class URL {
} else if (this.protocol.equals("https")) {
if (this.port < 0 || this.port == 443) { defaultPort = true; }
}
String path = this.getFile();
String path = this.getFile(includeReference);
if (path.length() == 0 || path.charAt(0) != '/') { path = "/" + path; }
@ -208,8 +220,9 @@ public class URL {
path = matcher.replaceAll("");
matcher.reset(path);
}
return this.protocol + "://" + this.getHost().toLowerCase() + ((defaultPort) ? "" : (":" + this.port)) + getFile();
if (defaultPort) { return this.protocol + "://" + this.getHost().toLowerCase() + path; }
return this.protocol + "://" + this.getHost().toLowerCase() + ((defaultPort) ? "" : (":" + this.port)) + path;
}
public boolean equals(URL other) {
@ -233,7 +246,8 @@ public class URL {
public static void main(String[] args) {
URL u;
try {u = new URL("http://www.anomic.de/home/test?x=1#home"); System.out.println(u.toString());} catch (MalformedURLException e) {}
try {u = new URL("http://www.anomic.de/home/test?x=1#home"); System.out.println("toString=" + u.toString() + "\ntoNormalform=" + u.toNormalform());} catch (MalformedURLException e) {}
try {u = new URL("http://www.anomic.de/home/test?x=1"); System.out.println("toString=" + u.toString() + "\ntoNormalform=" + u.toNormalform());} catch (MalformedURLException e) {}
try {u = new URL("http://www.anomic.de/home/test#home"); System.out.println("toString=" + u.toString() + "\ntoNormalform=" + u.toNormalform());} catch (MalformedURLException e) {}
}
}

@ -53,7 +53,6 @@ import java.net.SocketException;
import de.anomic.net.URL;
import java.net.UnknownHostException;
import java.util.Date;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
@ -427,7 +426,7 @@ public final class plasmaCrawlWorker extends Thread {
}
// normalizing URL
redirectionUrlString = htmlFilterContentScraper.urlNormalform(url, redirectionUrlString);
redirectionUrlString = new URL(url, redirectionUrlString).toNormalform();
// generating the new URL object
URL redirectionUrl = new URL(redirectionUrlString);

@ -53,7 +53,6 @@
package de.anomic.plasma;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
@ -727,7 +726,7 @@ public final class plasmaHTCache {
// normalize url
// serverLog.logFine("PLASMA", "Entry: URL=" + url.toString());
this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url);
this.nomalizedURLString = url.toNormalform();
try {
this.url = new URL(this.nomalizedURLString);

@ -298,16 +298,10 @@ public final class plasmaParser {
public static String getFileExt(URL url) {
// getting the file path
String name = url.getFile();
// chopping http parameters from the url
int p = name.lastIndexOf('?');
if (p != -1) {
name = name.substring(0,p);
}
String name = url.getPath();
// tetermining last position of / in the file path
p = name.lastIndexOf('/');
int p = name.lastIndexOf('/');
if (p != -1) {
name = name.substring(p);
}
@ -574,7 +568,7 @@ public final class plasmaParser {
String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
plasmaParserDocument ppd = new plasmaParserDocument(new URL(htmlFilterContentScraper.urlNormalform(location)),
plasmaParserDocument ppd = new plasmaParserDocument(new URL(location.toNormalform()),
mimeType, null, null, scraper.getTitle(),
sections, null,
scraper.getText(), scraper.getAnchors(), scraper.getImages());

@ -42,7 +42,6 @@
package de.anomic.plasma;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import java.io.ByteArrayInputStream;
@ -192,7 +191,9 @@ public class plasmaParserDocument {
} else {
ext = url.substring(extpos).toLowerCase();
}
normal = htmlFilterContentScraper.urlNormalform(null, url);
try {normal = new URL(url).toNormalform();} catch (MalformedURLException e1) {
normal = null;
}
if (normal != null) { //TODO: extension function is not correct
if (plasmaParser.mediaExtContains(ext.substring(1))) {
// this is not a normal anchor, its a media link
@ -216,7 +217,7 @@ public class plasmaParserDocument {
htmlFilterImageEntry iEntry;
while (i.hasNext()) {
iEntry = (htmlFilterImageEntry) i.next();
normal = htmlFilterContentScraper.urlNormalform(iEntry.url());
normal = iEntry.url().toNormalform();
if (normal != null) medialinks.put(normal, iEntry.alt()); // avoid NullPointerException
}

@ -47,7 +47,6 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.server.serverDate;
@ -72,10 +71,13 @@ public final class plasmaSearchImages {
Iterator i = hl.entrySet().iterator();
while (i.hasNext()) {
Map.Entry e = (Map.Entry) i.next();
String nexturlstring = htmlFilterContentScraper.urlNormalform(null, (String) e.getKey());
String nexturlstring;
try {
nexturlstring = new URL((String) e.getKey()).toNormalform();
addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), new URL(nexturlstring), depth - 1));
} catch (MalformedURLException e2) {}
} catch (MalformedURLException e1) {
e1.printStackTrace();
}
}
}
}

@ -54,7 +54,6 @@ import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySearch;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
@ -438,7 +437,7 @@ public class plasmaSnippetCache {
while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {
urlentry = acc.nextElement();
if (urlentry.url().getHost().endsWith(".yacyh")) continue;
urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url());
urlstring = urlentry.url().toNormalform();
if ((urlstring.matches(urlmask)) &&
(!(existsInCache(urlentry.url(), queryhashes)))) {
new Fetcher(urlentry.url(), queryhashes).start();

@ -107,6 +107,8 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import de.anomic.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
@ -1402,7 +1404,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
while (i.hasNext()) {
e = (Map.Entry) i.next();
nexturlstring = (String) e.getKey();
nexturlstring = htmlFilterContentScraper.urlNormalform(null, nexturlstring);
try {nexturlstring = new URL(nexturlstring).toNormalform();} catch (MalformedURLException e1) {}
sbStackCrawlThread.enqueue(nexturlstring, entry.url().toString(), initiatorHash, (String) e.getValue(), docDate, entry.depth() + 1, entry.profile());
@ -1883,9 +1885,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
url = new URL("http://" + address + "/" + host.substring(0, p) + filename);
urlname = "http://share." + seed.getName() + ".yacy" + filename;
if ((p = urlname.indexOf("?")) > 0) urlname = urlname.substring(0, p);
urlstring = htmlFilterContentScraper.urlNormalform(url);
urlstring = url.toNormalform();
} else {
urlstring = htmlFilterContentScraper.urlNormalform(url);
urlstring = url.toNormalform();
urlname = urlstring;
}
descr = urlentry.descr();

@ -44,7 +44,6 @@
package de.anomic.plasma;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order;
@ -276,7 +275,7 @@ public class plasmaSwitchboardQueue {
}
public String normalizedURLString() {
return htmlFilterContentScraper.urlNormalform(url);
return url.toNormalform();
}
public String urlHash() {

Loading…
Cancel
Save