small change

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1027 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
borg-0300 20 years ago
parent 00ab4d8723
commit 544e4ea90e

@ -3,7 +3,10 @@
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 18.02.2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -48,16 +51,15 @@ import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.TreeSet;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverByteBuffer;
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
// statics: for initialisation of the HTMLFilterAbstractScraper
private static TreeSet linkTags0;
private static TreeSet linkTags1;
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
static {
insensitiveCollator.setStrength(Collator.SECONDARY);
@ -97,7 +99,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
/**
 * Adds a fragment of scraped text to the accumulated <code>text</code> buffer.
 *
 * The fragment is wrapped in a serverByteBuffer, run through stripAll() and
 * trim(), and appended followed by one blank (byte value 32). If the buffer
 * already holds content that does not end with a blank, a blank is inserted
 * in front of the new fragment first.
 *
 * @param newtext the raw bytes of the text fragment to add
 */
public void scrapeText(byte[] newtext) {
    // System.out.println("SCRAPE: " + new String(newtext));

    // keep the previous content and the new fragment separated by a blank
    if (text.length() != 0) {
        if (text.byteAt(text.length() - 1) != 32) {
            text.append(32);
        }
    }

    // NOTE(review): the second constructor argument (length + 1) is passed
    // through unchanged; presumably a length/capacity hint - confirm against
    // serverByteBuffer.
    text.append(
            super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()
    ).append(32);
}
@ -108,28 +110,44 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
public static String urlNormalform(String us) {
if (us == null) return null;
if (us.length() == 0) return null;
serverLog.logFiner("htmlFilter", "urlNormalform: IN=" + us);
if (us == null) { return null; }
if (us.length() == 0) { return null; }
/* TODO: what about
* - case insensitive domain names
* - chars that should be escaped in URLs
*/
int p;
// cutting of everything behind #
if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
int cpos = us.indexOf("#");
if (cpos >= 0) { us = us.substring(0, cpos); }
if (us.startsWith("https")) {
if (us.endsWith(":443")) us = us.substring(0, us.length() - 4);
p = us.indexOf(":443/");
if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 4));
} else if (us.startsWith("http")) {
if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
p = us.indexOf(":80/");
if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
if (us.startsWith("http")) {
if (us.endsWith(":80")) {
us = us.substring(0, us.length() - 3);
serverLog.logFinest("htmlFilter", "urlNormalform: :80=" + us);
} else {
cpos = us.indexOf(":80/");
if (cpos >= 0) {
us = us.substring(0, cpos).concat(us.substring(cpos + 3));
serverLog.logFinest("htmlFilter", "urlNormalform: :80/=" + us);
}
}
} else if (us.startsWith("https")) {
if (us.endsWith(":443")) {
us = us.substring(0, us.length() - 4);
serverLog.logFinest("htmlFilter", "urlNormalform: :443=" + us);
} else {
cpos = us.indexOf(":443/");
if (cpos >= 0) {
us = us.substring(0, cpos).concat(us.substring(cpos + 4));
serverLog.logFinest("htmlFilter", "urlNormalform: :443/=" + us);
}
}
}
if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
serverLog.logFiner("htmlFilter", "urlNormalform: OUT=" + us);
return us;
}
@ -148,7 +166,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
//System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) headline = super.stripAll(new serverByteBuffer(text)).toString();
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = super.stripAll(new serverByteBuffer(text)).toString();
@ -166,6 +184,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// clean the line: may contain too many funny symbols
for (int i = 0; i < hl.length(); i++)
if (hl.charAt(i) < ' ') hl = hl.substring(0, i) + " " + hl.substring(i + 1);
// clean the line: remove double-spaces
int p;
while ((p = hl.indexOf(" ")) >= 0) hl = hl.substring(0, p) + hl.substring(p + 1);
@ -205,7 +224,6 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
System.out.println("TEXT :" + new String(text.getBytes()));
}
public static void main(String[] args) {
String test = "Nokia kürzt bei Forschung und Entwicklung";
try {

@ -134,10 +134,10 @@ public final class plasmaWordIndexEntry {
// serverLog.logFinest("PLASMA", "docType mime=" + mime);
char doctype = DT_UNKNOWN;
if (mime == null) doctype = DT_UNKNOWN;
else if (mime.endsWith("/gif")) doctype = DT_IMAGE;
else if (mime.endsWith("/jpg")) doctype = DT_IMAGE;
else if (mime.startsWith("image/")) doctype = DT_IMAGE;
/* else if (mime.endsWith("/gif")) doctype = DT_IMAGE;
else if (mime.endsWith("/jpeg")) doctype = DT_IMAGE;
else if (mime.endsWith("/png")) doctype = DT_IMAGE;
else if (mime.endsWith("/png")) doctype = DT_IMAGE; */
else if (mime.endsWith("/html")) doctype = DT_HTML;
else if (mime.endsWith("/rtf")) doctype = DT_DOC;
else if (mime.endsWith("/pdf")) doctype = DT_PDFPS;
@ -147,7 +147,7 @@ public final class plasmaWordIndexEntry {
else if (mime.endsWith("/mspowerpoint")) doctype = DT_DOC;
else if (mime.endsWith("/postscript")) doctype = DT_PDFPS;
else if (mime.startsWith("text/")) doctype = DT_TEXT;
else if (mime.startsWith("image/")) doctype = DT_IMAGE;
// else if (mime.startsWith("image/")) doctype = DT_IMAGE;
else if (mime.startsWith("audio/")) doctype = DT_AUDIO;
else if (mime.startsWith("video/")) doctype = DT_MOVIE;
//bz2 = application/x-bzip2

Loading…
Cancel
Save