From ad7f600f25bd392569757b34e247d03de3a2d148 Mon Sep 17 00:00:00 2001 From: theli Date: Mon, 18 Sep 2006 11:04:16 +0000 Subject: [PATCH] *) Bugfix. re-enabling inheritance of serverCharBuffer from writer class git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2618 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../htmlFilter/htmlFilterAbstractScraper.java | 2 +- .../htmlFilter/htmlFilterContentScraper.java | 30 ++++- .../htmlFilterContentTransformer.java | 20 ++- .../anomic/htmlFilter/htmlFilterWriter.java | 121 +++++++++++------- source/de/anomic/plasma/plasmaParser.java | 8 ++ source/de/anomic/server/serverCharBuffer.java | 10 +- 6 files changed, 128 insertions(+), 63 deletions(-) diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index 092cf6514..708f3a14b 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -1 +1 @@ -// htmlFilterAbstractScraper.java // --------------------------- // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // last major change: 18.02.2004 // // You agree that the Author(s) is (are) not responsible for cost, // loss of data or any harm that may be caused by usage of this softare or // this documentation. The usage of this software is on your own risk. The // installation and usage (starting/running) of this software may allow other // people or application to access your computer and any attached devices and // is highly dependent on the configuration of the software which must be // done by the user of the software;the author(s) is (are) also // not responsible for proper configuration and usage of the software, even // if provoked by documentation provided together with the software. // // THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION // IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS // FILE AND AS IN http://www.gnu.org/licenses/gpl.txt // ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE // LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT // BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION // MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. // CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. package de.anomic.htmlFilter; import java.util.HashMap; import java.util.Properties; import java.util.TreeSet; import de.anomic.server.serverCharBuffer; public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public static final char lb = '<'; public static final char rb = '>'; public static final char sl = '/'; private TreeSet tags0; private TreeSet tags1; // define a translation table for html character codings private static HashMap trans = new HashMap(300); static { trans.put(""", "\""); //Anführungszeichen oben trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und trans.put("<", "<"); //öffnende spitze Klammer trans.put(">", ">"); //schließende spitze Klammer trans.put(" ", " "); //Erzwungenes Leerzeichen trans.put("¡", "!"); //umgekehrtes Ausrufezeichen trans.put("¢", " cent "); //Cent-Zeichen trans.put("£", " pound "); //Pfund-Zeichen trans.put("¤", " currency "); //Währungs-Zeichen trans.put("¥", " yen "); //Yen-Zeichen trans.put("¦", " "); //durchbrochener Strich trans.put("§", " paragraph "); //Paragraph-Zeichen trans.put("¨", " "); //Pünktchen oben trans.put("©", " copyright "); //Copyright-Zeichen trans.put("ª", " "); //Ordinal-Zeichen weiblich trans.put("«", " "); //angewinkelte Anführungszeichen links trans.put("¬", " not "); //Verneinungs-Zeichen trans.put("­", "-"); //kurzer Trennstrich trans.put("®", " trademark "); //Registriermarke-Zeichen trans.put("¯", " "); //Überstrich trans.put("°", " degree "); //Grad-Zeichen trans.put("±", " +/- "); //Plusminus-Zeichen trans.put("²", " square "); //Hoch-2-Zeichen trans.put("³", " 3 "); //Hoch-3-Zeichen trans.put("´", " "); //Acute-Zeichen trans.put("µ", " micro "); //Mikro-Zeichen trans.put("¶", " paragraph "); //Absatz-Zeichen trans.put("·", " "); //Mittelpunkt trans.put("¸", " "); //Häkchen unten trans.put("¹", " "); //Hoch-1-Zeichen trans.put("º", " degree "); //Ordinal-Zeichen männlich trans.put("»", " "); //angewinkelte Anführungszeichen rechts trans.put("¼", " quarter "); //ein Viertel trans.put("½", " half "); //ein Halb trans.put("¾", " 3/4 "); //drei Viertel trans.put("¿", "?"); //umgekehrtes Fragezeichen trans.put("À", "A"); //A mit Accent grave trans.put("Á", "A"); //A mit Accent acute trans.put("Â", "A"); //A mit Circumflex trans.put("Ã", "A"); //A mit Tilde trans.put("Ä", "Ae"); //A Umlaut trans.put("Å", "A"); //A mit Ring trans.put("Æ", "A"); //A mit legiertem E trans.put("Ç", "C"); //C mit Häkchen trans.put("È", "E"); //E mit Accent grave trans.put("É", "E"); //E mit Accent acute trans.put("Ê", "E"); //E mit Circumflex trans.put("Ë", "E"); //E Umlaut trans.put("Ì", "I"); //I mit Accent grave trans.put("Í", "I"); //I mit Accent acute trans.put("Î", "I"); //I mit Circumflex trans.put("Ï", "I"); //I Umlaut trans.put("Ð", "D"); //Eth (isländisch) trans.put("Ñ", "N"); //N mit Tilde trans.put("Ò", "O"); //O mit Accent grave trans.put("Ó", "O"); //O mit Accent acute trans.put("Ô", "O"); //O mit Circumflex trans.put("Õ", "O"); //O mit Tilde trans.put("Ö", "Oe"); //O Umlaut trans.put("×", " times "); //Mal-Zeichen trans.put("Ø", "O"); //O mit Schrägstrich trans.put("Ù", "U"); //U mit Accent grave trans.put("Ú", "U"); //U mit Accent acute trans.put("Û", "U"); //U mit Circumflex trans.put("Ü", "Ue"); //U Umlaut trans.put("Ý", "Y"); //Y mit Accent acute trans.put("Þ", "P"); //THORN (isländisch) trans.put("ß", "ss"); //scharfes S trans.put("à", "a"); //a mit Accent grave trans.put("á", "a"); //a mit Accent acute trans.put("â", "a"); //a mit Circumflex trans.put("ã", "a"); //a mit Tilde trans.put("ä", "ae"); //a Umlaut trans.put("å", "a"); //a mit Ring trans.put("æ", "a"); //a mit legiertem e trans.put("ç", "c"); //c mit Häkchen trans.put("è", "e"); //e mit Accent grave trans.put("é", "e"); //e mit Accent acute trans.put("ê", "e"); //e mit Circumflex trans.put("ë", "e"); //e Umlaut trans.put("ì", "i"); //i mit Accent grave trans.put("í", "i"); //i mit Accent acute trans.put("î", "i"); //i mit Circumflex trans.put("ï", "i"); //i Umlaut trans.put("ð", "d"); //eth (isländisch) trans.put("ñ", "n"); //n mit Tilde trans.put("ò", "o"); //o mit Accent grave trans.put("ó", "o"); //o mit Accent acute trans.put("ô", "o"); //o mit Circumflex trans.put("õ", "o"); //o mit Tilde trans.put("ö", "oe"); //o Umlaut trans.put("÷", "%"); //Divisions-Zeichen trans.put("ø", "o"); //o mit Schrägstrich trans.put("ù", "u"); //u mit Accent grave trans.put("ú", "u"); //u mit Accent acute trans.put("û", "u"); //u mit Circumflex trans.put("ü", "ue"); //u Umlaut trans.put("ý", "y"); //y mit Accent acute trans.put("þ", "p"); //thorn (isländisch) trans.put("ÿ", "y"); //y Umlaut trans.put("Α", " Alpha "); //Alpha groß trans.put("α", " alpha "); //alpha klein trans.put("Β", " Beta "); //Beta groß trans.put("β", " beta "); //beta klein trans.put("Γ", " Gamma "); //Gamma groß trans.put("γ", " gamma "); //gamma klein trans.put("Δ", " Delta "); //Delta groß trans.put("δ", " delta "); //delta klein trans.put("Ε", " Epsilon "); //Epsilon groß trans.put("ε", " epsilon "); //epsilon klein trans.put("Ζ", " Zeta "); //Zeta groß trans.put("ζ", " zeta "); //zeta klein trans.put("Η", " Eta "); //Eta groß trans.put("η", " eta "); //eta klein trans.put("Θ", " Theta "); //Theta groß trans.put("θ", " theta "); //theta klein trans.put("Ι", " Iota "); //Iota groß trans.put("ι", " iota "); //iota klein trans.put("Κ", " Kappa "); //Kappa groß trans.put("κ", " kappa "); //kappa klein trans.put("Λ", " Lambda "); //Lambda groß trans.put("λ", " lambda "); //lambda klein trans.put("Μ", " Mu "); //Mu groß trans.put("μ", " mu "); //mu klein trans.put("Ν", " Nu "); //Nu groß trans.put("ν", " nu "); //nu klein trans.put("Ξ", " Xi "); //Xi groß trans.put("ξ", " xi "); //xi klein trans.put("Ο", " Omicron "); //Omicron groß trans.put("ο", " omicron "); //omicron klein trans.put("Π", " Pi "); //Pi groß trans.put("π", " pi "); //pi klein trans.put("Ρ", " Rho "); //Rho groß trans.put("ρ", " rho "); //rho klein trans.put("Σ", " Sigma "); //Sigma groß trans.put("ς", " sigma "); //sigmaf klein trans.put("σ", " sigma "); //sigma klein trans.put("Τ", " Tau "); //Tau groß trans.put("τ", " tau "); //tau klein trans.put("Υ", " Ypsilon "); //Upsilon groß trans.put("υ", " ypsilon "); //upsilon klein trans.put("Φ", " Phi "); //Phi groß trans.put("φ", " phi "); //phi klein trans.put("Χ", " Chi "); //Chi groß trans.put("χ", " chi "); //chi klein trans.put("Ψ", " Psi "); //Psi groß trans.put("ψ", " psi "); //psi klein trans.put("Ω", " Omega "); //Omega groß trans.put("ω", " omega "); //omega klein trans.put("ϑ", " theta "); //theta Symbol trans.put("ϒ", " ypsilon "); //upsilon mit Haken trans.put("ϖ", " pi "); //pi Symbol trans.put("∀", " for all "); //für alle trans.put("∂", " part of "); //teilweise trans.put("∃", " exists "); //existiert trans.put("∅", " null "); //leer trans.put("∇", " nabla "); //nabla trans.put("∈", " element of "); //Element von trans.put("∉", " not element of "); //kein Element von trans.put("∋", " contains "); //enthält als Element trans.put("∏", " product "); //Produkt trans.put("∑", " sum "); //Summe trans.put("−", " minus "); //minus trans.put("∗", " times "); //Asterisk trans.put("√", " sqare root "); //Quadratwurzel trans.put("∝", " proportional to "); //proportional zu trans.put("∞", " unlimited "); //unendlich trans.put("∠", " angle "); //Winkel trans.put("∧", " and "); //und trans.put("∨", " or "); //oder trans.put("∩", " "); //Schnittpunkt trans.put("∪", " unity "); //Einheit trans.put("∫", " integral "); //Integral trans.put("∴", " cause "); //deshalb trans.put("∼", " similar to "); //ähnlich wie trans.put("≅", " equal "); //annähernd gleich trans.put("≈", " equal "); //beinahe gleich trans.put("≠", " not equal "); //ungleich trans.put("≡", " identical "); //identisch mit trans.put("≤", " smaller or equal than "); //kleiner gleich trans.put("≥", " greater or equal than "); //größer gleich trans.put("⊂", " subset of "); //Untermenge von trans.put("⊃", " superset of "); //Obermenge von trans.put("⊄", " not subset of "); //keine Untermenge von trans.put("⊆", ""); //Untermenge von oder gleich mit trans.put("⊇", ""); //Obermenge von oder gleich mit trans.put("⊕", ""); //Direktsumme trans.put("⊗", ""); //Vektorprodukt trans.put("⊥", ""); //senkrecht zu trans.put("⋅", ""); //Punkt-Operator trans.put("◊", ""); //Raute trans.put("⌈", ""); //links oben trans.put("⌉", ""); //rechts oben trans.put("⌊", ""); //links unten trans.put("⌋", ""); //rechts unten trans.put("⟨", ""); //spitze Klammer links trans.put("⟩", ""); //spitze Klammer rechts trans.put("←", ""); //Pfeil links trans.put("↑", ""); //Pfeil oben trans.put("→", ""); //Pfeil rechts trans.put("↓", ""); //Pfeil unten trans.put("↔", ""); //Pfeil links/rechts trans.put("↵", ""); //Pfeil unten-Knick-links trans.put("⇐", ""); //Doppelpfeil links trans.put("⇑", ""); //Doppelpfeil oben trans.put("⇒", ""); //Doppelpfeil rechts trans.put("⇓", ""); //Doppelpfeil unten trans.put("⇔", ""); //Doppelpfeil links/rechts trans.put("•", ""); //Bullet-Zeichen trans.put("…", ""); //Horizontale Ellipse trans.put("′", ""); //Minutenzeichen trans.put("‾", ""); //Überstrich trans.put("⁄", ""); //Bruchstrich trans.put("℘", ""); //Weierstrass p trans.put("ℑ", ""); //Zeichen für "imaginär" trans.put("ℜ", ""); //Zeichen für "real" trans.put("™", ""); //Trademark-Zeichen trans.put("€", ""); //Euro-Zeichen trans.put("ℵ", ""); //Alef-Symbol trans.put("♠", ""); //Pik-Zeichen trans.put("♣", ""); //Kreuz-Zeichen trans.put("♥", ""); //Herz-Zeichen trans.put("♦", ""); //Karo-Zeichen trans.put(" ", ""); //Leerzeichen Breite n trans.put(" ", ""); //Leerzeichen Breite m trans.put(" ", ""); //Schmales Leerzeichen trans.put("‌", ""); //null breiter Nichtverbinder trans.put("‍", ""); //null breiter Verbinder trans.put("‎", ""); //links-nach-rechts-Zeichen trans.put("‏", ""); //rechts-nach-links-Zeichen trans.put("–", ""); //Gedankenstrich Breite n trans.put("—", ""); //Gedankenstrich Breite m trans.put("‘", ""); //einfaches Anführungszeichen links trans.put("’", ""); //einfaches Anführungszeichen rechts trans.put("‚", ""); //einfaches low-9-Zeichen trans.put("“", ""); //doppeltes Anführungszeichen links trans.put("”", ""); //doppeltes Anführungszeichen rechts trans.put("„", ""); //doppeltes low-9-Zeichen rechts trans.put("†", ""); //Kreuz trans.put("‡", ""); //Doppelkreuz trans.put("‰", ""); //zu tausend trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts } public htmlFilterAbstractScraper(TreeSet tags0, TreeSet tags1) { this.tags0 = tags0; this.tags1 = tags1; } public boolean isTag0(String tag) { return (tags0 != null) && (tags0.contains(tag)); } public boolean isTag1(String tag) { return (tags1 != null) && (tags1.contains(tag)); } //the 'missing' method that shall be implemented: public abstract void scrapeText(char[] text); // the other methods must take into account to construct the return value correctly public abstract void scrapeTag0(String tagname, Properties tagopts); public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text); // string conversions private static String code_iso8859s(char c) { switch (c) { // german umlaute and ligaturen case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE"; case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue"; case 0xdf: return "ss"; // accent on letters; i.e. french characters case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A"; case 0xc6: return "AE"; case 0xc7: return "C"; case 0xc8: case 0xc9: case 0xca: return "E"; case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I"; case 0xd0: return "D"; case 0xd1: return "N"; case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O"; case 0xd7: return "x"; case 0xd9: case 0xda: case 0xdb: return "U"; case 0xdd: return "Y"; case 0xde: return "p"; case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a"; case 0xe6: return "ae"; case 0xe7: return "c"; case 0xe8: case 0xe9: case 0xea: return "e"; case 0xec: case 0xed: case 0xee: case 0xef: return "i"; case 0xf0: return "d"; case 0xf1: return "n"; case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o"; case 0xf7: return "%"; case 0xf9: case 0xfa: case 0xfb: return "u"; case 0xfd: case 0xff: return "y"; case 0xfe: return "p"; // special characters case 0xa4: return " euro "; default: return null; } } public static serverCharBuffer convertUmlaute(serverCharBuffer bb) { if (bb.length() == 0) return bb; serverCharBuffer t = new serverCharBuffer(bb.length() + 20); char c; for (int i = 0; i < bb.length(); i++) { c = bb.charAt(i); String z = code_iso8859s(c); if (z == null) t.append(c); else t.append(z); } return t; // serverByteBuffer t = new serverByteBuffer(bb.length() + 20); // int b0, b1, b2; // String z; // int i = 0; // while (i < bb.length()) { // b0 = bb.byteAt(i) & 0xff; // // check utf-8 encoding // if ((b0 < 128) || (i + 1 == bb.length())) { // t.append(b0); // i++; // } else { // b1 = bb.byteAt(i + 1) & 0xff; // if (b1 > 0x3f) { // z = code_iso8859s(b0); // i++; // } else if ((b0 > 0xbf) && (b0 < 0xe0)) { // z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f)); // i += 2; // } else { // if (i + 2 >= bb.length()) { // z = null; // i++; // } else { // b2 = bb.byteAt(i + 2) & 0xff; // if (b2 > 0x3f) { // z = code_iso8859s(b0); // i++; // } else { // z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f)); // i += 3; // } // } // } // if (z == null) t.append(b0); else t.append(z); // } // } // return t; } private static char[] transscript(char[] code) { String t = (String) trans.get(new String(code)); if (t == null) return new char[0]; return t.toCharArray(); } protected static serverCharBuffer transscriptAll(serverCharBuffer bb) { int p0 = 0, p1; char[] t; while ((p0 = bb.indexOf('&', p0)) >= 0) { p1 = bb.indexOf(';', p0); if (p1 >= 0) { t = transscript(bb.getChars(p0, p1 + 1)); bb = new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getChars(p1 + 1)); } else { bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).append(bb.getChars(p0 + 1)); } } t = null; return bb; } protected static serverCharBuffer stripAllTags(serverCharBuffer bb) { int p0 = 0, p1; while ((p0 = bb.indexOf(lb, p0)) >= 0) { p1 = bb.indexOf(rb, p0); if (p1 >= 0) { bb = ((serverCharBuffer)new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + 1).trim().append(32)).append(new serverCharBuffer(bb.getChars(p1 + 1)).trim()); } else { bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).trim().append(new serverCharBuffer(bb.getChars(p0 + 1)).trim()); } } return bb.trim(); } public static serverCharBuffer stripAll(serverCharBuffer bb) { //return stripAllTags(s); return convertUmlaute(transscriptAll(stripAllTags(bb))); } public void close() { // free resources tags0 = null; tags1 = null; } public void finalize() { close(); } } \ No newline at end of file +// htmlFilterAbstractScraper.java // --------------------------- // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // last major change: 18.02.2004 // // You agree that the Author(s) is (are) not responsible for cost, // loss of data or any harm that may be caused by usage of this softare or // this documentation. The usage of this software is on your own risk. The // installation and usage (starting/running) of this software may allow other // people or application to access your computer and any attached devices and // is highly dependent on the configuration of the software which must be // done by the user of the software;the author(s) is (are) also // not responsible for proper configuration and usage of the software, even // if provoked by documentation provided together with the software. // // THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION // IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS // FILE AND AS IN http://www.gnu.org/licenses/gpl.txt // ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE // LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT // BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION // MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. // CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. package de.anomic.htmlFilter; import java.io.IOException; import java.util.HashMap; import java.util.Properties; import java.util.TreeSet; import de.anomic.server.serverCharBuffer; public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public static final char lb = '<'; public static final char rb = '>'; public static final char sl = '/'; private TreeSet tags0; private TreeSet tags1; // define a translation table for html character codings private static HashMap trans = new HashMap(300); static { trans.put(""", "\""); //Anführungszeichen oben trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und trans.put("<", "<"); //öffnende spitze Klammer trans.put(">", ">"); //schließende spitze Klammer trans.put(" ", " "); //Erzwungenes Leerzeichen trans.put("¡", "!"); //umgekehrtes Ausrufezeichen trans.put("¢", " cent "); //Cent-Zeichen trans.put("£", " pound "); //Pfund-Zeichen trans.put("¤", " currency "); //Währungs-Zeichen trans.put("¥", " yen "); //Yen-Zeichen trans.put("¦", " "); //durchbrochener Strich trans.put("§", " paragraph "); //Paragraph-Zeichen trans.put("¨", " "); //Pünktchen oben trans.put("©", " copyright "); //Copyright-Zeichen trans.put("ª", " "); //Ordinal-Zeichen weiblich trans.put("«", " "); //angewinkelte Anführungszeichen links trans.put("¬", " not "); //Verneinungs-Zeichen trans.put("­", "-"); //kurzer Trennstrich trans.put("®", " trademark "); //Registriermarke-Zeichen trans.put("¯", " "); //Überstrich trans.put("°", " degree "); //Grad-Zeichen trans.put("±", " +/- "); //Plusminus-Zeichen trans.put("²", " square "); //Hoch-2-Zeichen trans.put("³", " 3 "); //Hoch-3-Zeichen trans.put("´", " "); //Acute-Zeichen trans.put("µ", " micro "); //Mikro-Zeichen trans.put("¶", " paragraph "); //Absatz-Zeichen trans.put("·", " "); //Mittelpunkt trans.put("¸", " "); //Häkchen unten trans.put("¹", " "); //Hoch-1-Zeichen trans.put("º", " degree "); //Ordinal-Zeichen männlich trans.put("»", " "); //angewinkelte Anführungszeichen rechts trans.put("¼", " quarter "); //ein Viertel trans.put("½", " half "); //ein Halb trans.put("¾", " 3/4 "); //drei Viertel trans.put("¿", "?"); //umgekehrtes Fragezeichen trans.put("À", "A"); //A mit Accent grave trans.put("Á", "A"); //A mit Accent acute trans.put("Â", "A"); //A mit Circumflex trans.put("Ã", "A"); //A mit Tilde trans.put("Ä", "Ae"); //A Umlaut trans.put("Å", "A"); //A mit Ring trans.put("Æ", "A"); //A mit legiertem E trans.put("Ç", "C"); //C mit Häkchen trans.put("È", "E"); //E mit Accent grave trans.put("É", "E"); //E mit Accent acute trans.put("Ê", "E"); //E mit Circumflex trans.put("Ë", "E"); //E Umlaut trans.put("Ì", "I"); //I mit Accent grave trans.put("Í", "I"); //I mit Accent acute trans.put("Î", "I"); //I mit Circumflex trans.put("Ï", "I"); //I Umlaut trans.put("Ð", "D"); //Eth (isländisch) trans.put("Ñ", "N"); //N mit Tilde trans.put("Ò", "O"); //O mit Accent grave trans.put("Ó", "O"); //O mit Accent acute trans.put("Ô", "O"); //O mit Circumflex trans.put("Õ", "O"); //O mit Tilde trans.put("Ö", "Oe"); //O Umlaut trans.put("×", " times "); //Mal-Zeichen trans.put("Ø", "O"); //O mit Schrägstrich trans.put("Ù", "U"); //U mit Accent grave trans.put("Ú", "U"); //U mit Accent acute trans.put("Û", "U"); //U mit Circumflex trans.put("Ü", "Ue"); //U Umlaut trans.put("Ý", "Y"); //Y mit Accent acute trans.put("Þ", "P"); //THORN (isländisch) trans.put("ß", "ss"); //scharfes S trans.put("à", "a"); //a mit Accent grave trans.put("á", "a"); //a mit Accent acute trans.put("â", "a"); //a mit Circumflex trans.put("ã", "a"); //a mit Tilde trans.put("ä", "ae"); //a Umlaut trans.put("å", "a"); //a mit Ring trans.put("æ", "a"); //a mit legiertem e trans.put("ç", "c"); //c mit Häkchen trans.put("è", "e"); //e mit Accent grave trans.put("é", "e"); //e mit Accent acute trans.put("ê", "e"); //e mit Circumflex trans.put("ë", "e"); //e Umlaut trans.put("ì", "i"); //i mit Accent grave trans.put("í", "i"); //i mit Accent acute trans.put("î", "i"); //i mit Circumflex trans.put("ï", "i"); //i Umlaut trans.put("ð", "d"); //eth (isländisch) trans.put("ñ", "n"); //n mit Tilde trans.put("ò", "o"); //o mit Accent grave trans.put("ó", "o"); //o mit Accent acute trans.put("ô", "o"); //o mit Circumflex trans.put("õ", "o"); //o mit Tilde trans.put("ö", "oe"); //o Umlaut trans.put("÷", "%"); //Divisions-Zeichen trans.put("ø", "o"); //o mit Schrägstrich trans.put("ù", "u"); //u mit Accent grave trans.put("ú", "u"); //u mit Accent acute trans.put("û", "u"); //u mit Circumflex trans.put("ü", "ue"); //u Umlaut trans.put("ý", "y"); //y mit Accent acute trans.put("þ", "p"); //thorn (isländisch) trans.put("ÿ", "y"); //y Umlaut trans.put("Α", " Alpha "); //Alpha groß trans.put("α", " alpha "); //alpha klein trans.put("Β", " Beta "); //Beta groß trans.put("β", " beta "); //beta klein trans.put("Γ", " Gamma "); //Gamma groß trans.put("γ", " gamma "); //gamma klein trans.put("Δ", " Delta "); //Delta groß trans.put("δ", " delta "); //delta klein trans.put("Ε", " Epsilon "); //Epsilon groß trans.put("ε", " epsilon "); //epsilon klein trans.put("Ζ", " Zeta "); //Zeta groß trans.put("ζ", " zeta "); //zeta klein trans.put("Η", " Eta "); //Eta groß trans.put("η", " eta "); //eta klein trans.put("Θ", " Theta "); //Theta groß trans.put("θ", " theta "); //theta klein trans.put("Ι", " Iota "); //Iota groß trans.put("ι", " iota "); //iota klein trans.put("Κ", " Kappa "); //Kappa groß trans.put("κ", " kappa "); //kappa klein trans.put("Λ", " Lambda "); //Lambda groß trans.put("λ", " lambda "); //lambda klein trans.put("Μ", " Mu "); //Mu groß trans.put("μ", " mu "); //mu klein trans.put("Ν", " Nu "); //Nu groß trans.put("ν", " nu "); //nu klein trans.put("Ξ", " Xi "); //Xi groß trans.put("ξ", " xi "); //xi klein trans.put("Ο", " Omicron "); //Omicron groß trans.put("ο", " omicron "); //omicron klein trans.put("Π", " Pi "); //Pi groß trans.put("π", " pi "); //pi klein trans.put("Ρ", " Rho "); //Rho groß trans.put("ρ", " rho "); //rho klein trans.put("Σ", " Sigma "); //Sigma groß trans.put("ς", " sigma "); //sigmaf klein trans.put("σ", " sigma "); //sigma klein trans.put("Τ", " Tau "); //Tau groß trans.put("τ", " tau "); //tau klein trans.put("Υ", " Ypsilon "); //Upsilon groß trans.put("υ", " ypsilon "); //upsilon klein trans.put("Φ", " Phi "); //Phi groß trans.put("φ", " phi "); //phi klein trans.put("Χ", " Chi "); //Chi groß trans.put("χ", " chi "); //chi klein trans.put("Ψ", " Psi "); //Psi groß trans.put("ψ", " psi "); //psi klein trans.put("Ω", " Omega "); //Omega groß trans.put("ω", " omega "); //omega klein trans.put("ϑ", " theta "); //theta Symbol trans.put("ϒ", " ypsilon "); //upsilon mit Haken trans.put("ϖ", " pi "); //pi Symbol trans.put("∀", " for all "); //für alle trans.put("∂", " part of "); //teilweise trans.put("∃", " exists "); //existiert trans.put("∅", " null "); //leer trans.put("∇", " nabla "); //nabla trans.put("∈", " element of "); //Element von trans.put("∉", " not element of "); //kein Element von trans.put("∋", " contains "); //enthält als Element trans.put("∏", " product "); //Produkt trans.put("∑", " sum "); //Summe trans.put("−", " minus "); //minus trans.put("∗", " times "); //Asterisk trans.put("√", " sqare root "); //Quadratwurzel trans.put("∝", " proportional to "); //proportional zu trans.put("∞", " unlimited "); //unendlich trans.put("∠", " angle "); //Winkel trans.put("∧", " and "); //und trans.put("∨", " or "); //oder trans.put("∩", " "); //Schnittpunkt trans.put("∪", " unity "); //Einheit trans.put("∫", " integral "); //Integral trans.put("∴", " cause "); //deshalb trans.put("∼", " similar to "); //ähnlich wie trans.put("≅", " equal "); //annähernd gleich trans.put("≈", " equal "); //beinahe gleich trans.put("≠", " not equal "); //ungleich trans.put("≡", " identical "); //identisch mit trans.put("≤", " smaller or equal than "); //kleiner gleich trans.put("≥", " greater or equal than "); //größer gleich trans.put("⊂", " subset of "); //Untermenge von trans.put("⊃", " superset of "); //Obermenge von trans.put("⊄", " not subset of "); //keine Untermenge von trans.put("⊆", ""); //Untermenge von oder gleich mit trans.put("⊇", ""); //Obermenge von oder gleich mit trans.put("⊕", ""); //Direktsumme trans.put("⊗", ""); //Vektorprodukt trans.put("⊥", ""); //senkrecht zu trans.put("⋅", ""); //Punkt-Operator trans.put("◊", ""); //Raute trans.put("⌈", ""); //links oben trans.put("⌉", ""); //rechts oben trans.put("⌊", ""); //links unten trans.put("⌋", ""); //rechts unten trans.put("⟨", ""); //spitze Klammer links trans.put("⟩", ""); //spitze Klammer rechts trans.put("←", ""); //Pfeil links trans.put("↑", ""); //Pfeil oben trans.put("→", ""); //Pfeil rechts trans.put("↓", ""); //Pfeil unten trans.put("↔", ""); //Pfeil links/rechts trans.put("↵", ""); //Pfeil unten-Knick-links trans.put("⇐", ""); //Doppelpfeil links trans.put("⇑", ""); //Doppelpfeil oben trans.put("⇒", ""); //Doppelpfeil rechts trans.put("⇓", ""); //Doppelpfeil unten trans.put("⇔", ""); //Doppelpfeil links/rechts trans.put("•", ""); //Bullet-Zeichen trans.put("…", ""); //Horizontale Ellipse trans.put("′", ""); //Minutenzeichen trans.put("‾", ""); //Überstrich trans.put("⁄", ""); //Bruchstrich trans.put("℘", ""); //Weierstrass p trans.put("ℑ", ""); //Zeichen für "imaginär" trans.put("ℜ", ""); //Zeichen für "real" trans.put("™", ""); //Trademark-Zeichen trans.put("€", ""); //Euro-Zeichen trans.put("ℵ", ""); //Alef-Symbol trans.put("♠", ""); //Pik-Zeichen trans.put("♣", ""); //Kreuz-Zeichen trans.put("♥", ""); //Herz-Zeichen trans.put("♦", ""); //Karo-Zeichen trans.put(" ", ""); //Leerzeichen Breite n trans.put(" ", ""); //Leerzeichen Breite m trans.put(" ", ""); //Schmales Leerzeichen trans.put("‌", ""); //null breiter Nichtverbinder trans.put("‍", ""); //null breiter Verbinder trans.put("‎", ""); //links-nach-rechts-Zeichen trans.put("‏", ""); //rechts-nach-links-Zeichen trans.put("–", ""); //Gedankenstrich Breite n trans.put("—", ""); //Gedankenstrich Breite m trans.put("‘", ""); //einfaches Anführungszeichen links trans.put("’", ""); //einfaches Anführungszeichen rechts trans.put("‚", ""); //einfaches low-9-Zeichen trans.put("“", ""); //doppeltes Anführungszeichen links trans.put("”", ""); //doppeltes Anführungszeichen rechts trans.put("„", ""); //doppeltes low-9-Zeichen rechts trans.put("†", ""); //Kreuz trans.put("‡", ""); //Doppelkreuz trans.put("‰", ""); //zu tausend trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts } public htmlFilterAbstractScraper(TreeSet tags0, TreeSet tags1) { this.tags0 = tags0; this.tags1 = tags1; } public boolean isTag0(String tag) { return (tags0 != null) && (tags0.contains(tag)); } public boolean isTag1(String tag) { return (tags1 != null) && (tags1.contains(tag)); } //the 'missing' method that shall be implemented: public abstract void scrapeText(char[] text); // the other methods must take into account to construct the return value correctly public abstract void scrapeTag0(String tagname, Properties tagopts); public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text); // string conversions private static String code_iso8859s(char c) { switch (c) { // german umlaute and ligaturen case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE"; case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue"; case 0xdf: return "ss"; // accent on letters; i.e. french characters case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A"; case 0xc6: return "AE"; case 0xc7: return "C"; case 0xc8: case 0xc9: case 0xca: return "E"; case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I"; case 0xd0: return "D"; case 0xd1: return "N"; case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O"; case 0xd7: return "x"; case 0xd9: case 0xda: case 0xdb: return "U"; case 0xdd: return "Y"; case 0xde: return "p"; case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a"; case 0xe6: return "ae"; case 0xe7: return "c"; case 0xe8: case 0xe9: case 0xea: return "e"; case 0xec: case 0xed: case 0xee: case 0xef: return "i"; case 0xf0: return "d"; case 0xf1: return "n"; case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o"; case 0xf7: return "%"; case 0xf9: case 0xfa: case 0xfb: return "u"; case 0xfd: case 0xff: return "y"; case 0xfe: return "p"; // special characters case 0xa4: return " euro "; default: return null; } } public static serverCharBuffer convertUmlaute(serverCharBuffer bb) { if (bb.length() == 0) return bb; try { serverCharBuffer t = new serverCharBuffer(bb.length() + 20); char c; for (int i = 0; i < bb.length(); i++) { c = bb.charAt(i); String z = code_iso8859s(c); if (z == null) t.append(c); else t.append(z); } return t; } catch (IOException e) { // ignore this return null; } // serverByteBuffer t = new serverByteBuffer(bb.length() + 20); // int b0, b1, b2; // String z; // int i = 0; // while (i < bb.length()) { // b0 = bb.byteAt(i) & 0xff; // // check utf-8 encoding // if ((b0 < 128) || (i + 1 == bb.length())) { // t.append(b0); // i++; // } else { // b1 = bb.byteAt(i + 1) & 0xff; // if (b1 > 0x3f) { // z = code_iso8859s(b0); // i++; // } else if ((b0 > 0xbf) && (b0 < 0xe0)) { // z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f)); // i += 2; // } else { // if (i + 2 >= bb.length()) { // z = null; // i++; // } else { // b2 = bb.byteAt(i + 2) & 0xff; // if (b2 > 0x3f) { // z = code_iso8859s(b0); // i++; // } else { // z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f)); // i += 3; // } // } // } // if (z == null) t.append(b0); else t.append(z); // } // } // return t; } private static char[] transscript(char[] code) { String t = (String) trans.get(new String(code)); if (t == null) return new char[0]; return t.toCharArray(); } protected static serverCharBuffer transscriptAll(serverCharBuffer bb) { int p0 = 0, p1; char[] t; while ((p0 = bb.indexOf('&', p0)) >= 0) { p1 = bb.indexOf(';', p0); if (p1 >= 0) { t = transscript(bb.getChars(p0, p1 + 1)); bb = new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getChars(p1 + 1)); } else { bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).append(bb.getChars(p0 + 1)); } } t = null; return bb; } protected static serverCharBuffer stripAllTags(serverCharBuffer bb) { int p0 = 0, p1; while ((p0 = bb.indexOf(lb, p0)) >= 0) { p1 = bb.indexOf(rb, p0); if (p1 >= 0) { bb = ((serverCharBuffer)new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + 1).trim().append(32)).append(new serverCharBuffer(bb.getChars(p1 + 1)).trim()); } else { bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).trim().append(new serverCharBuffer(bb.getChars(p0 + 1)).trim()); } } return bb.trim(); } public static serverCharBuffer stripAll(serverCharBuffer bb) { //return stripAllTags(s); return convertUmlaute(transscriptAll(stripAllTags(bb))); } public void close() { // free resources tags0 = null; tags1 = null; } public void finalize() { close(); } } \ No newline at end of file diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index b41d97ca6..2252f3844 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -43,6 +43,9 @@ package de.anomic.htmlFilter; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.nio.charset.Charset; @@ -57,7 +60,9 @@ import java.util.Properties; import java.util.TreeSet; import de.anomic.net.URL; +import de.anomic.server.serverByteBuffer; import de.anomic.server.serverCharBuffer; +import de.anomic.server.serverFileUtils; public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { @@ -113,6 +118,23 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen this.content = new serverCharBuffer(1024); } + public void transformCharset(String oldCharset, String newCharset) throws UnsupportedEncodingException { +// // convert the content back to the old bytearray +// ByteArrayInputStream temp = new ByteArrayInputStream(new String(this.content.getChars()).getBytes(oldCharset)); +// +// // create a reader with the new charset +// serverCharBuffer newContent = new serverCharBuffer(this.content.length()); +// try { +// InputStreamReader reader = new InputStreamReader(temp,newCharset); +// serverFileUtils.copy(reader, newContent); +// reader.close(); +// } catch (IOException e) { +// // ignore this +// } +// +// this.content = newContent; + } + public void scrapeText(char[] newtext) { // System.out.println("SCRAPE: " + new String(newtext)); if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32); @@ -246,10 +268,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen for (int j = 0; j < headlines[i - 1].size(); j++) s[j] = (String) headlines[i - 1].get(j); return s; } - + public byte[] getText() { + return this.getText("UTF-8"); + } + + public byte[] getText(String charSet) { try { - return content.toString().getBytes("UTF-8"); + return content.toString().getBytes(charSet); } catch (UnsupportedEncodingException e) { return content.toString().getBytes(); } diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java index d39a0b9fc..1f20f51db 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java @@ -46,6 +46,7 @@ package de.anomic.htmlFilter; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; +import java.io.IOException; import java.text.Collator; import java.util.ArrayList; import java.util.Locale; @@ -112,14 +113,19 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer } private static char[] genBlueLetters(int length) { - serverCharBuffer bb = new serverCharBuffer(" ".toCharArray()); - length = length / 2; - if (length > 10) length = 7; - while (length-- > 0) { - bb.append('X'); + try { + serverCharBuffer bb = new serverCharBuffer(" ".toCharArray()); + length = length / 2; + if (length > 10) length = 7; + while (length-- > 0) { + bb.append('X'); + } + bb.append(" "); + return bb.getChars(); + } catch (IOException e) { + // ignore this. + return null; } - bb.append(" "); - return bb.getChars(); } private boolean bluelistHit(char[] text) { diff --git a/source/de/anomic/htmlFilter/htmlFilterWriter.java b/source/de/anomic/htmlFilter/htmlFilterWriter.java index a232be49a..5426b696e 100644 --- a/source/de/anomic/htmlFilter/htmlFilterWriter.java +++ b/source/de/anomic/htmlFilter/htmlFilterWriter.java @@ -117,67 +117,92 @@ public final class htmlFilterWriter extends Writer { } public static char[] genTag0raw(String tagname, boolean opening, char[] tagopts) { - serverCharBuffer bb = new serverCharBuffer(tagname.length() + tagopts.length + 3); - bb.append('<'); - if (!opening) { - bb.append('/'); - } - bb.append(tagname); - if (tagopts.length > 0) { -// if (tagopts[0] == (byte) 32) - bb.append(tagopts); -// else bb.append((byte) 32).append(tagopts); + try { + serverCharBuffer bb = new serverCharBuffer(tagname.length() + tagopts.length + 3); + bb.append('<'); + if (!opening) { + bb.append('/'); + } + bb.append(tagname); + if (tagopts.length > 0) { +// if (tagopts[0] == (byte) 32) + bb.append(tagopts); +// else bb.append((byte) 32).append(tagopts); + } + bb.append('>'); + return bb.getChars(); + } catch (IOException e) { + // ignore this + return null; } - bb.append('>'); - return bb.getChars(); } public static char[] genTag1raw(String tagname, char[] tagopts, char[] text) { - serverCharBuffer bb = new serverCharBuffer(2 * tagname.length() + tagopts.length + text.length + 5); - bb.append('<').append(tagname); - if (tagopts.length > 0) { -// if (tagopts[0] == (byte) 32) - bb.append(tagopts); -// else bb.append((byte) 32).append(tagopts); + try { + serverCharBuffer bb = new serverCharBuffer(2 * tagname.length() + tagopts.length + text.length + 5); + bb.append('<').append(tagname); + if (tagopts.length > 0) { +// if (tagopts[0] == (byte) 32) + bb.append(tagopts); +// else bb.append((byte) 32).append(tagopts); + } + bb.append('>'); + bb.append(text); + bb.append('<').append('/').append(tagname).append('>'); + return bb.getChars(); + } catch (IOException e) { + // ignore this + return null; } - bb.append('>'); - bb.append(text); - bb.append('<').append('/').append(tagname).append('>'); - return bb.getChars(); } - public static char[] genTag0(String tagname, Properties tagopts, char quotechar) { - char[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar); - serverCharBuffer bb = new serverCharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2); - bb.append('<').append(tagname); - if (tagoptsx != null) { - bb.append(32); - bb.append(tagoptsx); - } - bb.append('>'); - return bb.getChars(); + public static char[] genTag0(String tagname, Properties tagopts, char quotechar) { + try { + char[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar); + serverCharBuffer bb = new serverCharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2); + bb.append('<').append(tagname); + if (tagoptsx != null) { + bb.append(32); + bb.append(tagoptsx); + } + bb.append('>'); + return bb.getChars(); + } catch (IOException e) { + // ignore this + return null; + } } - public static char[] genTag1(String tagname, Properties tagopts, char[] text, char quotechar) { - char[] gt0 = genTag0(tagname, tagopts, quotechar); - serverCharBuffer cb = new serverCharBuffer(gt0, gt0.length + text.length + tagname.length() + 3); - cb.append(text).append('<').append('/').append(tagname).append('>'); - return cb.getChars(); + public static char[] genTag1(String tagname, Properties tagopts, char[] text, char quotechar) { + try { + char[] gt0 = genTag0(tagname, tagopts, quotechar); + serverCharBuffer cb = new serverCharBuffer(gt0, gt0.length + text.length + tagname.length() + 3); + cb.append(text).append('<').append('/').append(tagname).append('>'); + return cb.getChars(); + } catch (IOException e) { + // ignore this + return null; + } } // a helper method for pretty-printing of properties for html tags - public static char[] genOpts(Properties prop, char quotechar) { - Enumeration e = prop.propertyNames(); - serverCharBuffer bb = new serverCharBuffer(prop.size() * 40); - String key; - while (e.hasMoreElements()) { - key = (String) e.nextElement(); - bb.append(32).append(key).append('=').append(quotechar); - bb.append(prop.getProperty(key)); - bb.append(quotechar); + public static char[] genOpts(Properties prop, char quotechar) { + try { + Enumeration e = prop.propertyNames(); + serverCharBuffer bb = new serverCharBuffer(prop.size() * 40); + String key; + while (e.hasMoreElements()) { + key = (String) e.nextElement(); + bb.append(32).append(key).append('=').append(quotechar); + bb.append(prop.getProperty(key)); + bb.append(quotechar); + } + if (bb.length() > 0) return bb.getChars(1); + return bb.getChars(); + }catch (IOException e) { + // ignore this + return null; } - if (bb.length() > 0) return bb.getChars(1); - return bb.getChars(); } private char[] filterTag(String tag, boolean opening, char[] content, char quotechar) { diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 02c360b38..17ca8e3f6 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -598,6 +598,14 @@ public final class plasmaParser { public plasmaParserDocument transformScraper(URL location, String mimeType, String charSet, htmlFilterContentScraper scraper) { try { + if (scraper.getMetas().containsKey("content-type")) { + String newCharset = (String) scraper.getMetas().get("content-type"); + if (!charSet.equals(newCharset)) { + // TODO: transformation of content needed + this.theLogger.logFine("Charset transformation needed from '" + charSet + "' to '" + newCharset + "'"); + } + } + String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length]; int p = 0; for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j]; diff --git a/source/de/anomic/server/serverCharBuffer.java b/source/de/anomic/server/serverCharBuffer.java index 7d9687a9d..de70bb360 100644 --- a/source/de/anomic/server/serverCharBuffer.java +++ b/source/de/anomic/server/serverCharBuffer.java @@ -50,7 +50,7 @@ import java.io.IOException; import java.io.Writer; import java.util.Properties; -public final class serverCharBuffer /* extends Writer */ { +public final class serverCharBuffer extends Writer { public static final char singlequote = '\''; public static final char doublequote = '"'; @@ -164,10 +164,10 @@ public final class serverCharBuffer /* extends Writer */ { length += le; } - public serverCharBuffer append(char b) { - write(b); - return this; - } +// public serverCharBuffer append(char b) { +// write(b); +// return this; +// } public serverCharBuffer append(int i) { write((char) (i));