diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index c6f7db521..a1ebdbcc8 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -3,7 +3,10 @@
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
-// last major change: 18.02.2004
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@@ -48,32 +51,31 @@ import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.TreeSet;
-
+import de.anomic.server.logging.serverLog;
import de.anomic.server.serverByteBuffer;
-
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
-
// statics: for initialisation of the HTMLFilterAbstractScraper
private static TreeSet linkTags0;
private static TreeSet linkTags1;
+
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
static {
- insensitiveCollator.setStrength(Collator.SECONDARY);
- insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
+ insensitiveCollator.setStrength(Collator.SECONDARY);
+ insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}
-
+
static {
- linkTags0 = new TreeSet(insensitiveCollator);
- linkTags0.add("img");
+ linkTags0 = new TreeSet(insensitiveCollator);
+ linkTags0.add("img");
linkTags0.add("base");
linkTags0.add("frame");
- linkTags1 = new TreeSet(insensitiveCollator);
- linkTags1.add("a");
- linkTags1.add("h1");
- linkTags1.add("title");
+ linkTags1 = new TreeSet(insensitiveCollator);
+ linkTags1.add("a");
+ linkTags1.add("h1");
+ linkTags1.add("title");
}
// class variables: collectors for links
@@ -87,103 +89,120 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
public htmlFilterContentScraper(URL root) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
- super(linkTags0, linkTags1);
- this.root = root;
- this.anchors = new HashMap();
- this.images = new HashMap();
- this.title = "";
- this.headline = "";
- this.text = new serverByteBuffer(1024);
+ super(linkTags0, linkTags1);
+ this.root = root;
+ this.anchors = new HashMap();
+ this.images = new HashMap();
+ this.title = "";
+ this.headline = "";
+ this.text = new serverByteBuffer(1024);
}
public void scrapeText(byte[] newtext) {
- //System.out.println("SCRAPE: " + new String(newtext));
- if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append(32);
- text.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32);
+// System.out.println("SCRAPE: " + new String(newtext));
+ if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append(32);
+ text.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32);
}
public static String urlNormalform(URL url) {
if (url == null) return null;
return urlNormalform(url.toString());
}
-
+
public static String urlNormalform(String us) {
- if (us == null) return null;
- if (us.length() == 0) return null;
-
+ serverLog.logFiner("htmlFilter", "urlNormalform: IN=" + us);
+ if (us == null) { return null; }
+ if (us.length() == 0) { return null; }
+ // steps below: drop the #fragment, drop the scheme's default port, drop a trailing '/' when the URL has no path
/* TODO: what about
* - case insensitive domain names
* - chars that should be escaped in URLs
*/
- int p;
-
+
// cutting of everything behind #
- if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
-
- if (us.startsWith("https")) {
- if (us.endsWith(":443")) us = us.substring(0, us.length() - 4);
- p = us.indexOf(":443/");
- if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 4));
- } else if (us.startsWith("http")) {
- if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
- p = us.indexOf(":80/");
- if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
- }
+ int cpos = us.indexOf("#");
+ if (cpos >= 0) { us = us.substring(0, cpos); }
+ // "https" must be tested before "http": every https URL also starts with "http", so the reverse order never strips :443
+ if (us.startsWith("https")) {
+ if (us.endsWith(":443")) {
+ us = us.substring(0, us.length() - 4);
+ serverLog.logFinest("htmlFilter", "urlNormalform: :443=" + us);
+ } else {
+ cpos = us.indexOf(":443/");
+ if (cpos >= 0) {
+ us = us.substring(0, cpos).concat(us.substring(cpos + 4));
+ serverLog.logFinest("htmlFilter", "urlNormalform: :443/=" + us);
+ }
+ }
+ } else if (us.startsWith("http")) {
+ if (us.endsWith(":80")) {
+ us = us.substring(0, us.length() - 3);
+ serverLog.logFinest("htmlFilter", "urlNormalform: :80=" + us);
+ } else {
+ cpos = us.indexOf(":80/");
+ if (cpos >= 0) {
+ us = us.substring(0, cpos).concat(us.substring(cpos + 3));
+ serverLog.logFinest("htmlFilter", "urlNormalform: :80/=" + us);
+ }
+ }
+ }
if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
+ serverLog.logFiner("htmlFilter", "urlNormalform: OUT=" + us);
return us;
- }
-
+ }
+
private String absolutePath(String relativePath) {
- try {
- return urlNormalform(new URL(root, relativePath));
- } catch (Exception e) {
- return "";
- }
+ try {
+ return urlNormalform(new URL(root, relativePath));
+ } catch (Exception e) {
+ return "";
+ }
}
public void scrapeTag0(String tagname, Properties tagopts) {
- if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
+ if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {}
if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
}
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
- //System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
- if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
- if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString();
- if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString();
+// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
+ if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
+ if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString();
+ if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString();
}
public String getHeadline() {
- String hl = "";
+ String hl = "";
// extract headline from content
- if (title.length() > 0) hl = title.trim();
- else if (headline.length() > 0) hl = headline.trim();
- else if (text.length() > 80) hl = new String(text.getBytes(), 0, 80).trim();
- else hl = text.trim().toString();
+ if (title.length() > 0) hl = title.trim();
+ else if (headline.length() > 0) hl = headline.trim();
+ else if (text.length() > 80) hl = new String(text.getBytes(), 0, 80).trim();
+ else hl = text.trim().toString();
// clean the line: may contain too many funny symbols
for (int i = 0; i < hl.length(); i++)
if (hl.charAt(i) < ' ') hl = hl.substring(0, i) + " " + hl.substring(i + 1);
+
// clean the line: remove double-spaces
int p;
while ((p = hl.indexOf(" ")) >= 0) hl = hl.substring(0, p) + hl.substring(p + 1);
-
+
// return result
- return hl.trim();
+ return hl.trim();
}
public byte[] getText() {
- return text.getBytes();
+ return text.getBytes();
}
-
+
public Map getAnchors() {
- return anchors;
+ return anchors;
}
public Map getImages() {
- return images;
+ return images;
}
public void close() {
@@ -196,23 +215,22 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
text = null;
root = null;
}
-
+
public void print() {
- System.out.println("TITLE :" + title);
- System.out.println("HEADLINE:" + headline);
- System.out.println("ANCHORS :" + anchors.toString());
- System.out.println("IMAGES :" + images.toString());
- System.out.println("TEXT :" + new String(text.getBytes()));
+ System.out.println("TITLE :" + title);
+ System.out.println("HEADLINE:" + headline);
+ System.out.println("ANCHORS :" + anchors.toString());
+ System.out.println("IMAGES :" + images.toString());
+ System.out.println("TEXT :" + new String(text.getBytes()));
}
-
public static void main(String[] args) {
- String test = "Nokia kürzt bei Forschung und Entwicklung";
+ String test = "Nokia kürzt bei Forschung und Entwicklung";
try {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost"));
scraper.scrapeText(test.getBytes());
System.out.println(new String(scraper.getText()));
} catch (MalformedURLException e) {}
}
-
-}
+
+}
\ No newline at end of file
diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java
index 2531b6983..3c73125f2 100644
--- a/source/de/anomic/plasma/plasmaWordIndexEntry.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java
@@ -134,10 +134,10 @@ public final class plasmaWordIndexEntry {
// serverLog.logFinest("PLASMA", "docType mime=" + mime);
char doctype = DT_UNKNOWN;
if (mime == null) doctype = DT_UNKNOWN;
- else if (mime.endsWith("/gif")) doctype = DT_IMAGE;
- else if (mime.endsWith("/jpg")) doctype = DT_IMAGE;
+ else if (mime.startsWith("image/")) doctype = DT_IMAGE;
+/* else if (mime.endsWith("/gif")) doctype = DT_IMAGE;
else if (mime.endsWith("/jpeg")) doctype = DT_IMAGE;
- else if (mime.endsWith("/png")) doctype = DT_IMAGE;
+ else if (mime.endsWith("/png")) doctype = DT_IMAGE; */
else if (mime.endsWith("/html")) doctype = DT_HTML;
else if (mime.endsWith("/rtf")) doctype = DT_DOC;
else if (mime.endsWith("/pdf")) doctype = DT_PDFPS;
@@ -147,7 +147,7 @@ public final class plasmaWordIndexEntry {
else if (mime.endsWith("/mspowerpoint")) doctype = DT_DOC;
else if (mime.endsWith("/postscript")) doctype = DT_PDFPS;
else if (mime.startsWith("text/")) doctype = DT_TEXT;
- else if (mime.startsWith("image/")) doctype = DT_IMAGE;
+// else if (mime.startsWith("image/")) doctype = DT_IMAGE;
else if (mime.startsWith("audio/")) doctype = DT_AUDIO;
else if (mime.startsWith("video/")) doctype = DT_MOVIE;
//bz2 = application/x-bzip2