From 5a490aa06571c1790182938eb7dc216a47154c36 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 16 Jun 2005 21:49:56 +0000 Subject: [PATCH] fixed html parser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@289 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacy/hello.java | 29 ++++++++++--------- .../htmlFilter/htmlFilterContentScraper.java | 18 +++++++----- .../htmlFilterContentTransformer.java | 2 -- .../de/anomic/plasma/plasmaSwitchboard.java | 2 +- source/de/anomic/yacy/yacySeedDB.java | 12 ++++---- 5 files changed, 35 insertions(+), 28 deletions(-) diff --git a/htroot/yacy/hello.java b/htroot/yacy/hello.java index 2240bd601..5241593b1 100644 --- a/htroot/yacy/hello.java +++ b/htroot/yacy/hello.java @@ -78,24 +78,27 @@ public class hello { if (remoteSeed == null) return new serverObjects(); // we easily know the caller's IP: - String yourip = (String) header.get("CLIENTIP", ""); // read an artificial header addendum - //System.out.println("YACYHELLO: YOUR IP=" + yourip); - prop.put("yourip", yourip); - - // now let's check if the calling peer can be reached and answers - int urls = -1; + String clientip = (String) header.get("CLIENTIP", ""); // read an artificial header addendum String reportedip = remoteSeed.get("IP", ""); - remoteSeed.put("IP", yourip); - urls = yacyClient.queryUrlCount(remoteSeed); + float clientversion = remoteSeed.getVersion(); - // if this was not successful, we try to use the reported ip - if ((urls < 0) && (!(reportedip.equals(yourip)))) { - // the other peer does not respond under the ip it reported - // we try again using the ip we got from the http header + int urls = -1; + if ((!(clientip.equals(reportedip))) && (clientversion >= (float)0.383)) { + // try first the reportedip, since this may be a connect from a port-forwarding host + prop.put("yourip", reportedip); remoteSeed.put("IP", reportedip); urls = yacyClient.queryUrlCount(remoteSeed); } + if (urls < 0) { + // if the previous attempt was not successful, try the ip where the request came from + prop.put("yourip", clientip); + remoteSeed.put("IP", clientip); + urls = yacyClient.queryUrlCount(remoteSeed); + } + + //System.out.println("YACYHELLO: YOUR IP=" + clientip); + // assign status if (urls >= 0) { if (remoteSeed.get("PeerType", "senior") == null) { @@ -114,7 +117,7 @@ public class hello { remoteSeed.put("LastSeen", yacyCore.universalDateShortString()); yacyCore.peerActions.juniorConnects++; // update statistics remoteSeed.put("PeerType", "junior"); - yacyCore.log.logInfo("hello: responded remote junior peer '" + remoteSeed.getName() + "' from " + yourip + ":" + remoteSeed.get("Port", "8080")); + yacyCore.log.logInfo("hello: responded remote junior peer '" + remoteSeed.getName() + "' from " + remoteSeed.getAddress()); // no connection here, instead store junior in connection cache if ((remoteSeed.hash != null) && (remoteSeed.isProper())) yacyCore.peerActions.peerPing(remoteSeed); } diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 168e7c876..c8a1ee39b 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -52,7 +52,7 @@ import de.anomic.server.serverByteBuffer; public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { - // statics: for initialisation of the HTMLFilterAbstractTransformer + // statics: for initialisation of the HTMLFilterAbstractScraper private static HashSet linkTags0; private static HashSet linkTags1; @@ -122,10 +122,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { //System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text)); - if ((tagname.equals("a")) && (text.length < 2048)) { - byte[] a = super.stripAll(new serverByteBuffer(text)).getBytes(); - anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(a).trim().toString()); - } + //if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString()); + //if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); + //if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); + if ((tagname.equals("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString()); if ((tagname.equals("h1")) && (text.length < 512)) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); if ((tagname.equals("title")) && (text.length < 512)) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); } @@ -166,8 +166,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen public void close() { // free resources super.close(); - linkTags0 = null; - linkTags1 = null; + anchors = null; + images = null; + title = null; + headline = null; + text = null; + root = null; } public void print() { diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java index 39ffe3c86..a744c0b9d 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java @@ -131,8 +131,6 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer public void close() { // free resources super.close(); - linkTags0 = null; - linkTags1 = null; } } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 2d06b7547..d23fd613a 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -148,7 +148,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // load slots - public static int crawlSlots = 8; + public static int crawlSlots = 20; // couloured list management public static TreeSet blueList = null; diff --git a/source/de/anomic/yacy/yacySeedDB.java b/source/de/anomic/yacy/yacySeedDB.java index 82aeb7660..b4f4b515b 100644 --- a/source/de/anomic/yacy/yacySeedDB.java +++ b/source/de/anomic/yacy/yacySeedDB.java @@ -147,14 +147,16 @@ public class yacySeedDB { } private synchronized kelondroMap openSeedTable(File seedDBFile) throws IOException { - if (seedDBFile.exists()) { + if (seedDBFile.exists()) try { // open existing seed database return new kelondroMap(new kelondroDyn(seedDBFile, seedDBBufferKB * 0x400), sortFields, accFields); - } else { - // create new seed database - new File(seedDBFile.getParent()).mkdir(); - return new kelondroMap(new kelondroDyn(seedDBFile, seedDBBufferKB * 0x400, commonHashLength, 480), sortFields, accFields); + } catch (kelondroException e) { + // if we have an error, we start with a fresh database + if (seedDBFile.exists()) seedDBFile.delete(); } + // create new seed database + new File(seedDBFile.getParent()).mkdir(); + return new kelondroMap(new kelondroDyn(seedDBFile, seedDBBufferKB * 0x400, commonHashLength, 480), sortFields, accFields); } private synchronized kelondroMap resetSeedTable(kelondroMap seedDB, File seedDBFile) {