From 958ec20cd079d0f6b25522e8d42ef8a4ed6fb9a9 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 22 Oct 2008 11:11:55 +0000 Subject: [PATCH] removed specialized umlaute-handling in html parser. This has to be replaced by something that is able to transfer all possible html encodings into utf-8. Please see SVN 5293 for test cases. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5294 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../htmlFilter/htmlFilterAbstractScraper.java | 51 ++++--------------- 1 file changed, 9 insertions(+), 42 deletions(-) diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index a01f66527..44b2c28ac 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -26,7 +26,6 @@ package de.anomic.htmlFilter; -import java.util.HashMap; import java.util.HashSet; import java.util.Properties; @@ -42,6 +41,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { private HashSet tags1; // define a translation table for html character codings + /* private static HashMap trans = new HashMap(300); static { trans.put(""", "\""); //Anführungszeichen oben @@ -288,7 +288,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts } - +*/ /** * create a scraper. the tag sets must contain tags in lowercase! * @param tags0 @@ -316,6 +316,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text); // string conversions + /* private static String code_iso8859s(final char c) { switch (c) { @@ -356,8 +357,10 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { default: return null; } } - + + */ public static serverCharBuffer convertUmlaute(final serverCharBuffer bb) { + return bb; /* if (bb.length() == 0) return bb; final serverCharBuffer t = new serverCharBuffer(bb.length() + 20); @@ -369,45 +372,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { else t.append(z); } return t; - - -// serverByteBuffer t = new serverByteBuffer(bb.length() + 20); -// int b0, b1, b2; -// String z; -// int i = 0; -// while (i < bb.length()) { -// b0 = bb.byteAt(i) & 0xff; -// // check utf-8 encoding -// if ((b0 < 128) || (i + 1 == bb.length())) { -// t.append(b0); -// i++; -// } else { -// b1 = bb.byteAt(i + 1) & 0xff; -// if (b1 > 0x3f) { -// z = code_iso8859s(b0); -// i++; -// } else if ((b0 > 0xbf) && (b0 < 0xe0)) { -// z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f)); -// i += 2; -// } else { -// if (i + 2 >= bb.length()) { -// z = null; -// i++; -// } else { -// b2 = bb.byteAt(i + 2) & 0xff; -// if (b2 > 0x3f) { -// z = code_iso8859s(b0); -// i++; -// } else { -// z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f)); -// i += 3; -// } -// } -// } -// if (z == null) t.append(b0); else t.append(z); -// } -// } -// return t; + */ } private static char[] transscript(final char[] code) { @@ -417,9 +382,11 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { } return new char[] {(char) Integer.parseInt((new String(code)).substring(2, code.length - 1))}; } + return new char[0]; /* final String t = trans.get(new String(code)); if (t == null) return new char[0]; return t.toCharArray(); + */ } protected static serverCharBuffer transscriptAll(serverCharBuffer bb) {