fixed html parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@289 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent a25b5b4986
commit 5a490aa065

@ -78,24 +78,27 @@ public class hello {
if (remoteSeed == null) return new serverObjects();
// we easily know the caller's IP:
String yourip = (String) header.get("CLIENTIP", "<unknown>"); // read an artificial header addendum
//System.out.println("YACYHELLO: YOUR IP=" + yourip);
prop.put("yourip", yourip);
// now let's check if the calling peer can be reached and answers
int urls = -1;
String clientip = (String) header.get("CLIENTIP", "<unknown>"); // read an artificial header addendum
String reportedip = remoteSeed.get("IP", "");
remoteSeed.put("IP", yourip);
urls = yacyClient.queryUrlCount(remoteSeed);
float clientversion = remoteSeed.getVersion();
// if this was not successful, we try to use the reported ip
if ((urls < 0) && (!(reportedip.equals(yourip)))) {
// the other peer does not respond under the ip it reported
// we try again using the ip we got from the http header
int urls = -1;
if ((!(clientip.equals(reportedip))) && (clientversion >= (float)0.383)) {
// try first the reportedip, since this may be a connect from a port-forwarding host
prop.put("yourip", reportedip);
remoteSeed.put("IP", reportedip);
urls = yacyClient.queryUrlCount(remoteSeed);
}
if (urls < 0) {
// if the previous attempt was not successful, try the ip where the request came from
prop.put("yourip", clientip);
remoteSeed.put("IP", clientip);
urls = yacyClient.queryUrlCount(remoteSeed);
}
//System.out.println("YACYHELLO: YOUR IP=" + clientip);
// assign status
if (urls >= 0) {
if (remoteSeed.get("PeerType", "senior") == null) {
@ -114,7 +117,7 @@ public class hello {
remoteSeed.put("LastSeen", yacyCore.universalDateShortString());
yacyCore.peerActions.juniorConnects++; // update statistics
remoteSeed.put("PeerType", "junior");
yacyCore.log.logInfo("hello: responded remote junior peer '" + remoteSeed.getName() + "' from " + yourip + ":" + remoteSeed.get("Port", "8080"));
yacyCore.log.logInfo("hello: responded remote junior peer '" + remoteSeed.getName() + "' from " + remoteSeed.getAddress());
// no connection here, instead store junior in connection cache
if ((remoteSeed.hash != null) && (remoteSeed.isProper())) yacyCore.peerActions.peerPing(remoteSeed);
}

@ -52,7 +52,7 @@ import de.anomic.server.serverByteBuffer;
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
// statics: for initialisation of the HTMLFilterAbstractTransformer
// statics: for initialisation of the HTMLFilterAbstractScraper
private static HashSet linkTags0;
private static HashSet linkTags1;
@ -122,10 +122,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
//System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
if ((tagname.equals("a")) && (text.length < 2048)) {
byte[] a = super.stripAll(new serverByteBuffer(text)).getBytes();
anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(a).trim().toString());
}
//if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString());
//if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
//if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if ((tagname.equals("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString());
if ((tagname.equals("h1")) && (text.length < 512)) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if ((tagname.equals("title")) && (text.length < 512)) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
}
@ -166,8 +166,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
public void close() {
// free resources
super.close();
linkTags0 = null;
linkTags1 = null;
anchors = null;
images = null;
title = null;
headline = null;
text = null;
root = null;
}
public void print() {

@ -131,8 +131,6 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
public void close() {
// free resources
super.close();
linkTags0 = null;
linkTags1 = null;
}
}

@ -148,7 +148,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// load slots
public static int crawlSlots = 8;
public static int crawlSlots = 20;
// coloured list management
public static TreeSet blueList = null;

@ -147,14 +147,16 @@ public class yacySeedDB {
}
private synchronized kelondroMap openSeedTable(File seedDBFile) throws IOException {
if (seedDBFile.exists()) {
if (seedDBFile.exists()) try {
// open existing seed database
return new kelondroMap(new kelondroDyn(seedDBFile, seedDBBufferKB * 0x400), sortFields, accFields);
} else {
// create new seed database
new File(seedDBFile.getParent()).mkdir();
return new kelondroMap(new kelondroDyn(seedDBFile, seedDBBufferKB * 0x400, commonHashLength, 480), sortFields, accFields);
} catch (kelondroException e) {
// if we have an error, we start with a fresh database
if (seedDBFile.exists()) seedDBFile.delete();
}
// create new seed database
new File(seedDBFile.getParent()).mkdir();
return new kelondroMap(new kelondroDyn(seedDBFile, seedDBBufferKB * 0x400, commonHashLength, 480), sortFields, accFields);
}
private synchronized kelondroMap resetSeedTable(kelondroMap seedDB, File seedDBFile) {

Loading…
Cancel
Save