diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index c6f7db521..a1ebdbcc8 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -3,7 +3,10 @@ // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 -// last major change: 18.02.2004 +// +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -48,32 +51,31 @@ import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.TreeSet; - +import de.anomic.server.logging.serverLog; import de.anomic.server.serverByteBuffer; - public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { - // statics: for initialisation of the HTMLFilterAbstractScraper private static TreeSet linkTags0; private static TreeSet linkTags1; + private static final Collator insensitiveCollator = Collator.getInstance(Locale.US); static { - insensitiveCollator.setStrength(Collator.SECONDARY); - insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); + insensitiveCollator.setStrength(Collator.SECONDARY); + insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); } - + static { - linkTags0 = new TreeSet(insensitiveCollator); - linkTags0.add("img"); + linkTags0 = new TreeSet(insensitiveCollator); + linkTags0.add("img"); linkTags0.add("base"); linkTags0.add("frame"); - linkTags1 = new TreeSet(insensitiveCollator); - linkTags1.add("a"); - linkTags1.add("h1"); - linkTags1.add("title"); + linkTags1 = new TreeSet(insensitiveCollator); + linkTags1.add("a"); + linkTags1.add("h1"); + linkTags1.add("title"); } // class variables: collectors for links @@ -87,103 +89,120 @@ public class 
htmlFilterContentScraper extends htmlFilterAbstractScraper implemen public htmlFilterContentScraper(URL root) { // the root value here will not be used to load the resource. // it is only the reference for relative links - super(linkTags0, linkTags1); - this.root = root; - this.anchors = new HashMap(); - this.images = new HashMap(); - this.title = ""; - this.headline = ""; - this.text = new serverByteBuffer(1024); + super(linkTags0, linkTags1); + this.root = root; + this.anchors = new HashMap(); + this.images = new HashMap(); + this.title = ""; + this.headline = ""; + this.text = new serverByteBuffer(1024); } public void scrapeText(byte[] newtext) { - //System.out.println("SCRAPE: " + new String(newtext)); - if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append(32); - text.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32); +// System.out.println("SCRAPE: " + new String(newtext)); + if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append(32); + text.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32); } public static String urlNormalform(URL url) { if (url == null) return null; return urlNormalform(url.toString()); } - + public static String urlNormalform(String us) { - if (us == null) return null; - if (us.length() == 0) return null; - + serverLog.logFiner("htmlFilter", "urlNormalform: IN=" + us); + if (us == null) { return null; } + if (us.length() == 0) { return null; } + /* TODO: what about * - case insensitive domain names * - chars that should be escaped in URLs */ - int p; - // cutting of everything behind # - if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p); - - if (us.startsWith("https")) { - if (us.endsWith(":443")) us = us.substring(0, us.length() - 4); - p = us.indexOf(":443/"); - if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 4)); - } else if (us.startsWith("http")) { - if (us.endsWith(":80")) us = 
us.substring(0, us.length() - 3); - p = us.indexOf(":80/"); - if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3)); - } + int cpos = us.indexOf("#"); + if (cpos >= 0) { us = us.substring(0, cpos); } + + if (us.startsWith("https")) { /* must be tested before "http": every "https" URL also starts with "http" */ + if (us.endsWith(":443")) { + us = us.substring(0, us.length() - 4); + serverLog.logFinest("htmlFilter", "urlNormalform: :443=" + us); + } else { + cpos = us.indexOf(":443/"); + if (cpos >= 0) { + us = us.substring(0, cpos).concat(us.substring(cpos + 4)); + serverLog.logFinest("htmlFilter", "urlNormalform: :443/=" + us); + } + } + } else if (us.startsWith("http")) { + if (us.endsWith(":80")) { + us = us.substring(0, us.length() - 3); + serverLog.logFinest("htmlFilter", "urlNormalform: :80=" + us); + } else { + cpos = us.indexOf(":80/"); + if (cpos >= 0) { + us = us.substring(0, cpos).concat(us.substring(cpos + 3)); + serverLog.logFinest("htmlFilter", "urlNormalform: :80/=" + us); + } + } + } if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1); + serverLog.logFiner("htmlFilter", "urlNormalform: OUT=" + us); return us; - } - + } + private String absolutePath(String relativePath) { - try { - return urlNormalform(new URL(root, relativePath)); - } catch (Exception e) { - return ""; - } + try { + return urlNormalform(new URL(root, relativePath)); + } catch (Exception e) { + return ""; + } } public void scrapeTag0(String tagname, Properties tagopts) { - if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt","")); + if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt","")); if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {} if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name","")); } public void 
scrapeTag1(String tagname, Properties tagopts, byte[] text) { - //System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text)); - if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString()); - if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString(); - if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString(); +// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text)); + if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString()); + if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString(); + if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString(); } public String getHeadline() { - String hl = ""; + String hl = ""; // extract headline from content - if (title.length() > 0) hl = title.trim(); - else if (headline.length() > 0) hl = headline.trim(); - else if (text.length() > 80) hl = new String(text.getBytes(), 0, 80).trim(); - else hl = text.trim().toString(); + if (title.length() > 0) hl = title.trim(); + else if (headline.length() > 0) hl = headline.trim(); + else if (text.length() > 80) hl = new String(text.getBytes(), 0, 80).trim(); + else hl = text.trim().toString(); // clean the line: may contain too many funny symbols for (int i = 0; i < hl.length(); i++) if (hl.charAt(i) < ' ') hl = hl.substring(0, i) + " " + hl.substring(i + 1); + // clean the line: remove double-spaces int p; while ((p = hl.indexOf(" ")) >= 0) hl = 
hl.substring(0, p) + hl.substring(p + 1); - + // return result - return hl.trim(); + return hl.trim(); } public byte[] getText() { - return text.getBytes(); + return text.getBytes(); } - + public Map getAnchors() { - return anchors; + return anchors; } public Map getImages() { - return images; + return images; } public void close() { @@ -196,23 +215,22 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen text = null; root = null; } - + public void print() { - System.out.println("TITLE :" + title); - System.out.println("HEADLINE:" + headline); - System.out.println("ANCHORS :" + anchors.toString()); - System.out.println("IMAGES :" + images.toString()); - System.out.println("TEXT :" + new String(text.getBytes())); + System.out.println("TITLE :" + title); + System.out.println("HEADLINE:" + headline); + System.out.println("ANCHORS :" + anchors.toString()); + System.out.println("IMAGES :" + images.toString()); + System.out.println("TEXT :" + new String(text.getBytes())); } - public static void main(String[] args) { - String test = "Nokia kürzt bei Forschung und Entwicklung"; + String test = "Nokia kürzt bei Forschung und Entwicklung"; try { htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost")); scraper.scrapeText(test.getBytes()); System.out.println(new String(scraper.getText())); } catch (MalformedURLException e) {} } - -} + +} \ No newline at end of file diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java index 2531b6983..3c73125f2 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java @@ -134,10 +134,10 @@ public final class plasmaWordIndexEntry { // serverLog.logFinest("PLASMA", "docType mime=" + mime); char doctype = DT_UNKNOWN; if (mime == null) doctype = DT_UNKNOWN; - else if (mime.endsWith("/gif")) doctype = DT_IMAGE; - else if (mime.endsWith("/jpg")) doctype = 
DT_IMAGE; + else if (mime.startsWith("image/")) doctype = DT_IMAGE; +/* else if (mime.endsWith("/gif")) doctype = DT_IMAGE; else if (mime.endsWith("/jpeg")) doctype = DT_IMAGE; - else if (mime.endsWith("/png")) doctype = DT_IMAGE; + else if (mime.endsWith("/png")) doctype = DT_IMAGE; */ else if (mime.endsWith("/html")) doctype = DT_HTML; else if (mime.endsWith("/rtf")) doctype = DT_DOC; else if (mime.endsWith("/pdf")) doctype = DT_PDFPS; @@ -147,7 +147,7 @@ public final class plasmaWordIndexEntry { else if (mime.endsWith("/mspowerpoint")) doctype = DT_DOC; else if (mime.endsWith("/postscript")) doctype = DT_PDFPS; else if (mime.startsWith("text/")) doctype = DT_TEXT; - else if (mime.startsWith("image/")) doctype = DT_IMAGE; +// else if (mime.startsWith("image/")) doctype = DT_IMAGE; else if (mime.startsWith("audio/")) doctype = DT_AUDIO; else if (mime.startsWith("video/")) doctype = DT_MOVIE; //bz2 = application/x-bzip2