Removed the specialized umlaut handling in the HTML parser. It has to be replaced by something that is able to convert all possible HTML character encodings to UTF-8. Please see SVN 5293 for test cases.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5294 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · 958ec20cd0
parent 204220ecd5
commit 958ec20cd0
1 changed files with 9 additions and 42 deletions
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
@@ -26,7 +26,6 @@

 package de.anomic.htmlFilter;

-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Properties;

@@ -42,6 +41,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
    private HashSet<String> tags1;

    // define a translation table for html character codings
+    /*
    private static HashMap<String, String> trans = new HashMap<String, String>(300);
    static {
        trans.put("&quot;", "\""); //Anf&uuml;hrungszeichen oben
@@ -288,7 +288,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
        trans.put("&lsaquo;", ""); //angewinkeltes einzelnes Anf.zeichen links
        trans.put("&rsaquo;", ""); //angewinkeltes einzelnes Anf.zeichen rechts
    }
-
+*/
    /**
     * create a scraper. the tag sets must contain tags in lowercase!
     * @param tags0
@@ -316,6 +316,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
    public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);

    // string conversions
+    /*
    private static String code_iso8859s(final char c) {
        switch (c) {
        
@@ -357,7 +358,9 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
        }
    }
    
+  */
    public static serverCharBuffer convertUmlaute(final serverCharBuffer bb) {
+        return bb; /*
        if (bb.length() == 0) return bb;

            final serverCharBuffer t = new serverCharBuffer(bb.length() + 20);
@@ -369,45 +372,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
                else t.append(z);
            }
            return t;
-
-        
-//        serverByteBuffer t = new serverByteBuffer(bb.length() + 20);
-//        int b0, b1, b2;
-//        String z;
-//        int i = 0;
-//        while (i < bb.length()) {
-//            b0 = bb.byteAt(i) & 0xff;
-//            // check utf-8 encoding
-//            if ((b0 < 128) || (i + 1 == bb.length())) {
-//                t.append(b0);
-//                i++;
-//            } else {
-//                b1 = bb.byteAt(i + 1) & 0xff;
-//                if (b1 > 0x3f) {
-//                    z = code_iso8859s(b0);
-//                    i++;
-//                } else if ((b0 > 0xbf) && (b0 < 0xe0)) {
-//                    z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f));
-//                    i += 2;
-//                } else {
-//                    if (i + 2 >= bb.length()) {
-//                        z = null;
-//                        i++;
-//                    } else {
-//                        b2 = bb.byteAt(i + 2) & 0xff;
-//                        if (b2 > 0x3f) {
-//                            z = code_iso8859s(b0);
-//                            i++;
-//                        } else {
-//                            z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f));
-//                            i += 3;
-//                        }
-//                    }
-//                }
-//                if (z == null) t.append(b0); else t.append(z);
-//            }
-//        }
-//        return t;
+            */
    }

    private static char[] transscript(final char[] code) {
@@ -417,9 +382,11 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
            }
            return new char[] {(char) Integer.parseInt((new String(code)).substring(2, code.length - 1))};
        }
+        return new char[0]; /*
        final String t = trans.get(new String(code)); 
        if (t == null) return new char[0];
        return t.toCharArray();
+        */
    }

    protected static serverCharBuffer transscriptAll(serverCharBuffer bb) {