|
|
|
@ -3,7 +3,10 @@
|
|
|
|
|
// (C) by Michael Peter Christen; mc@anomic.de
|
|
|
|
|
// first published on http://www.anomic.de
|
|
|
|
|
// Frankfurt, Germany, 2004
|
|
|
|
|
// last major change: 18.02.2004
|
|
|
|
|
//
|
|
|
|
|
// $LastChangedDate$
|
|
|
|
|
// $LastChangedRevision$
|
|
|
|
|
// $LastChangedBy$
|
|
|
|
|
//
|
|
|
|
|
// This program is free software; you can redistribute it and/or modify
|
|
|
|
|
// it under the terms of the GNU General Public License as published by
|
|
|
|
@ -48,32 +51,31 @@ import java.util.Locale;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
import java.util.Properties;
|
|
|
|
|
import java.util.TreeSet;
|
|
|
|
|
|
|
|
|
|
import de.anomic.server.logging.serverLog;
|
|
|
|
|
import de.anomic.server.serverByteBuffer;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// statics: for initialisation of the HTMLFilterAbstractScraper
// tag sets passed to the superclass: tags without (0) and with (1) a closing tag
private static TreeSet linkTags0;
private static TreeSet linkTags1;

// collator used so tag-name lookups in the TreeSets are case-insensitive
// (SECONDARY strength ignores case differences)
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
|
|
|
|
|
static {
    // configure the collator for case-insensitive tag-name comparison;
    // the botched merge had every statement duplicated — each needs to run once
    insensitiveCollator.setStrength(Collator.SECONDARY);
    insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}
|
|
|
|
|
|
|
|
|
|
static {
    // tags whose attributes are scraped but which carry no enclosed text
    // (merge artifact removed: each set was allocated and populated twice)
    linkTags0 = new TreeSet(insensitiveCollator);
    linkTags0.add("img");
    linkTags0.add("base");
    linkTags0.add("frame");

    // tags whose enclosed text is scraped together with their attributes
    linkTags1 = new TreeSet(insensitiveCollator);
    linkTags1.add("a");
    linkTags1.add("h1");
    linkTags1.add("title");
}
|
|
|
|
|
|
|
|
|
|
// class variables: collectors for links
|
|
|
|
@ -87,19 +89,19 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|
|
|
|
/**
 * Creates a content scraper rooted at the given URL.
 * The root value here will not be used to load the resource;
 * it is only the reference for resolving relative links.
 * (Merge artifact removed: the body appeared twice, including a
 * second super(...) call, which does not compile.)
 *
 * @param root base URL used to absolutize relative hrefs/srcs
 */
public htmlFilterContentScraper(URL root) {
    super(linkTags0, linkTags1);
    this.root = root;
    this.anchors = new HashMap();
    this.images = new HashMap();
    this.title = "";
    this.headline = "";
    this.text = new serverByteBuffer(1024);
}
|
|
|
|
|
|
|
|
|
|
public void scrapeText(byte[] newtext) {
|
|
|
|
|
//System.out.println("SCRAPE: " + new String(newtext));
|
|
|
|
|
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append(32);
|
|
|
|
|
text.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32);
|
|
|
|
|
// System.out.println("SCRAPE: " + new String(newtext));
|
|
|
|
|
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append(32);
|
|
|
|
|
text.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static String urlNormalform(URL url) {
|
|
|
|
@ -108,82 +110,99 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static String urlNormalform(String us) {
|
|
|
|
|
if (us == null) return null;
|
|
|
|
|
if (us.length() == 0) return null;
|
|
|
|
|
serverLog.logFiner("htmlFilter", "urlNormalform: IN=" + us);
|
|
|
|
|
if (us == null) { return null; }
|
|
|
|
|
if (us.length() == 0) { return null; }
|
|
|
|
|
|
|
|
|
|
/* TODO: what about
|
|
|
|
|
* - case insensitive domain names
|
|
|
|
|
* - chars that should be escaped in URLs
|
|
|
|
|
*/
|
|
|
|
|
int p;
|
|
|
|
|
|
|
|
|
|
// cutting of everything behind #
|
|
|
|
|
if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
|
|
|
|
|
|
|
|
|
|
if (us.startsWith("https")) {
|
|
|
|
|
if (us.endsWith(":443")) us = us.substring(0, us.length() - 4);
|
|
|
|
|
p = us.indexOf(":443/");
|
|
|
|
|
if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 4));
|
|
|
|
|
} else if (us.startsWith("http")) {
|
|
|
|
|
if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
|
|
|
|
|
p = us.indexOf(":80/");
|
|
|
|
|
if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
|
|
|
|
|
int cpos = us.indexOf("#");
|
|
|
|
|
if (cpos >= 0) { us = us.substring(0, cpos); }
|
|
|
|
|
|
|
|
|
|
if (us.startsWith("http")) {
|
|
|
|
|
if (us.endsWith(":80")) {
|
|
|
|
|
us = us.substring(0, us.length() - 3);
|
|
|
|
|
serverLog.logFinest("htmlFilter", "urlNormalform: :80=" + us);
|
|
|
|
|
} else {
|
|
|
|
|
cpos = us.indexOf(":80/");
|
|
|
|
|
if (cpos >= 0) {
|
|
|
|
|
us = us.substring(0, cpos).concat(us.substring(cpos + 3));
|
|
|
|
|
serverLog.logFinest("htmlFilter", "urlNormalform: :80/=" + us);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else if (us.startsWith("https")) {
|
|
|
|
|
if (us.endsWith(":443")) {
|
|
|
|
|
us = us.substring(0, us.length() - 4);
|
|
|
|
|
serverLog.logFinest("htmlFilter", "urlNormalform: :443=" + us);
|
|
|
|
|
} else {
|
|
|
|
|
cpos = us.indexOf(":443/");
|
|
|
|
|
if (cpos >= 0) {
|
|
|
|
|
us = us.substring(0, cpos).concat(us.substring(cpos + 4));
|
|
|
|
|
serverLog.logFinest("htmlFilter", "urlNormalform: :443/=" + us);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
|
|
|
|
|
serverLog.logFiner("htmlFilter", "urlNormalform: OUT=" + us);
|
|
|
|
|
return us;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private String absolutePath(String relativePath) {
|
|
|
|
|
try {
|
|
|
|
|
return urlNormalform(new URL(root, relativePath));
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
return "";
|
|
|
|
|
}
|
|
|
|
|
try {
|
|
|
|
|
return urlNormalform(new URL(root, relativePath));
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
return "";
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void scrapeTag0(String tagname, Properties tagopts) {
|
|
|
|
|
if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
|
|
|
|
|
if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
|
|
|
|
|
if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {}
|
|
|
|
|
if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
|
|
|
|
|
//System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
|
|
|
|
|
if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
|
|
|
|
|
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString();
|
|
|
|
|
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString();
|
|
|
|
|
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
|
|
|
|
|
if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
|
|
|
|
|
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString();
|
|
|
|
|
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getHeadline() {
|
|
|
|
|
String hl = "";
|
|
|
|
|
String hl = "";
|
|
|
|
|
|
|
|
|
|
// extract headline from content
|
|
|
|
|
if (title.length() > 0) hl = title.trim();
|
|
|
|
|
else if (headline.length() > 0) hl = headline.trim();
|
|
|
|
|
else if (text.length() > 80) hl = new String(text.getBytes(), 0, 80).trim();
|
|
|
|
|
else hl = text.trim().toString();
|
|
|
|
|
if (title.length() > 0) hl = title.trim();
|
|
|
|
|
else if (headline.length() > 0) hl = headline.trim();
|
|
|
|
|
else if (text.length() > 80) hl = new String(text.getBytes(), 0, 80).trim();
|
|
|
|
|
else hl = text.trim().toString();
|
|
|
|
|
|
|
|
|
|
// clean the line: may contain too many funny symbols
|
|
|
|
|
for (int i = 0; i < hl.length(); i++)
|
|
|
|
|
if (hl.charAt(i) < ' ') hl = hl.substring(0, i) + " " + hl.substring(i + 1);
|
|
|
|
|
|
|
|
|
|
// clean the line: remove double-spaces
|
|
|
|
|
int p;
|
|
|
|
|
while ((p = hl.indexOf(" ")) >= 0) hl = hl.substring(0, p) + hl.substring(p + 1);
|
|
|
|
|
|
|
|
|
|
// return result
|
|
|
|
|
return hl.trim();
|
|
|
|
|
return hl.trim();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public byte[] getText() {
|
|
|
|
|
return text.getBytes();
|
|
|
|
|
return text.getBytes();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Map getAnchors() {
|
|
|
|
|
return anchors;
|
|
|
|
|
return anchors;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Map getImages() {
|
|
|
|
|
return images;
|
|
|
|
|
return images;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void close() {
|
|
|
|
@ -198,16 +217,15 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void print() {
|
|
|
|
|
System.out.println("TITLE :" + title);
|
|
|
|
|
System.out.println("HEADLINE:" + headline);
|
|
|
|
|
System.out.println("ANCHORS :" + anchors.toString());
|
|
|
|
|
System.out.println("IMAGES :" + images.toString());
|
|
|
|
|
System.out.println("TEXT :" + new String(text.getBytes()));
|
|
|
|
|
System.out.println("TITLE :" + title);
|
|
|
|
|
System.out.println("HEADLINE:" + headline);
|
|
|
|
|
System.out.println("ANCHORS :" + anchors.toString());
|
|
|
|
|
System.out.println("IMAGES :" + images.toString());
|
|
|
|
|
System.out.println("TEXT :" + new String(text.getBytes()));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) {
|
|
|
|
|
String test = "Nokia kürzt bei Forschung und Entwicklung";
|
|
|
|
|
String test = "Nokia kürzt bei Forschung und Entwicklung";
|
|
|
|
|
try {
|
|
|
|
|
htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost"));
|
|
|
|
|
scraper.scrapeText(test.getBytes());
|
|
|
|
|