diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
index 092cf6514..708f3a14b 100644
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
@@ -1 +1 @@
-// htmlFilterAbstractScraper.java
// ---------------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 18.02.2004
//
// You agree that the Author(s) is (are) not responsible for cost,
// loss of data or any harm that may be caused by usage of this softare or
// this documentation. The usage of this software is on your own risk. The
// installation and usage (starting/running) of this software may allow other
// people or application to access your computer and any attached devices and
// is highly dependent on the configuration of the software which must be
// done by the user of the software;the author(s) is (are) also
// not responsible for proper configuration and usage of the software, even
// if provoked by documentation provided together with the software.
//
// THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION
// IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS
// FILE AND AS IN http://www.gnu.org/licenses/gpl.txt
// ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE
// LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT
// BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION
// MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE.
// CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH.
package de.anomic.htmlFilter;
import java.util.HashMap;
import java.util.Properties;
import java.util.TreeSet;
import de.anomic.server.serverCharBuffer;
public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
public static final char lb = '<';
public static final char rb = '>';
public static final char sl = '/';
private TreeSet tags0;
private TreeSet tags1;
// define a translation table for html character codings
private static HashMap trans = new HashMap(300);
static {
trans.put(""", "\""); //Anführungszeichen oben
trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und
trans.put("<", "<"); //öffnende spitze Klammer
trans.put(">", ">"); //schließende spitze Klammer
trans.put(" ", " "); //Erzwungenes Leerzeichen
trans.put("¡", "!"); //umgekehrtes Ausrufezeichen
trans.put("¢", " cent "); //Cent-Zeichen
trans.put("£", " pound "); //Pfund-Zeichen
trans.put("¤", " currency "); //Währungs-Zeichen
trans.put("¥", " yen "); //Yen-Zeichen
trans.put("¦", " "); //durchbrochener Strich
trans.put("§", " paragraph "); //Paragraph-Zeichen
trans.put("¨", " "); //Pünktchen oben
trans.put("©", " copyright "); //Copyright-Zeichen
trans.put("ª", " "); //Ordinal-Zeichen weiblich
trans.put("«", " "); //angewinkelte Anführungszeichen links
trans.put("¬", " not "); //Verneinungs-Zeichen
trans.put("", "-"); //kurzer Trennstrich
trans.put("®", " trademark "); //Registriermarke-Zeichen
trans.put("¯", " "); //Überstrich
trans.put("°", " degree "); //Grad-Zeichen
trans.put("±", " +/- "); //Plusminus-Zeichen
trans.put("²", " square "); //Hoch-2-Zeichen
trans.put("³", " 3 "); //Hoch-3-Zeichen
trans.put("´", " "); //Acute-Zeichen
trans.put("µ", " micro "); //Mikro-Zeichen
trans.put("¶", " paragraph "); //Absatz-Zeichen
trans.put("·", " "); //Mittelpunkt
trans.put("¸", " "); //Häkchen unten
trans.put("¹", " "); //Hoch-1-Zeichen
trans.put("º", " degree "); //Ordinal-Zeichen männlich
trans.put("»", " "); //angewinkelte Anführungszeichen rechts
trans.put("¼", " quarter "); //ein Viertel
trans.put("½", " half "); //ein Halb
trans.put("¾", " 3/4 "); //drei Viertel
trans.put("¿", "?"); //umgekehrtes Fragezeichen
trans.put("À", "A"); //A mit Accent grave
trans.put("Á", "A"); //A mit Accent acute
trans.put("Â", "A"); //A mit Circumflex
trans.put("Ã", "A"); //A mit Tilde
trans.put("Ä", "Ae"); //A Umlaut
trans.put("Å", "A"); //A mit Ring
trans.put("Æ", "A"); //A mit legiertem E
trans.put("Ç", "C"); //C mit Häkchen
trans.put("È", "E"); //E mit Accent grave
trans.put("É", "E"); //E mit Accent acute
trans.put("Ê", "E"); //E mit Circumflex
trans.put("Ë", "E"); //E Umlaut
trans.put("Ì", "I"); //I mit Accent grave
trans.put("Í", "I"); //I mit Accent acute
trans.put("Î", "I"); //I mit Circumflex
trans.put("Ï", "I"); //I Umlaut
trans.put("Ð", "D"); //Eth (isländisch)
trans.put("Ñ", "N"); //N mit Tilde
trans.put("Ò", "O"); //O mit Accent grave
trans.put("Ó", "O"); //O mit Accent acute
trans.put("Ô", "O"); //O mit Circumflex
trans.put("Õ", "O"); //O mit Tilde
trans.put("Ö", "Oe"); //O Umlaut
trans.put("×", " times "); //Mal-Zeichen
trans.put("Ø", "O"); //O mit Schrägstrich
trans.put("Ù", "U"); //U mit Accent grave
trans.put("Ú", "U"); //U mit Accent acute
trans.put("Û", "U"); //U mit Circumflex
trans.put("Ü", "Ue"); //U Umlaut
trans.put("Ý", "Y"); //Y mit Accent acute
trans.put("Þ", "P"); //THORN (isländisch)
trans.put("ß", "ss"); //scharfes S
trans.put("à", "a"); //a mit Accent grave
trans.put("á", "a"); //a mit Accent acute
trans.put("â", "a"); //a mit Circumflex
trans.put("ã", "a"); //a mit Tilde
trans.put("ä", "ae"); //a Umlaut
trans.put("å", "a"); //a mit Ring
trans.put("æ", "a"); //a mit legiertem e
trans.put("ç", "c"); //c mit Häkchen
trans.put("è", "e"); //e mit Accent grave
trans.put("é", "e"); //e mit Accent acute
trans.put("ê", "e"); //e mit Circumflex
trans.put("ë", "e"); //e Umlaut
trans.put("ì", "i"); //i mit Accent grave
trans.put("í", "i"); //i mit Accent acute
trans.put("î", "i"); //i mit Circumflex
trans.put("ï", "i"); //i Umlaut
trans.put("ð", "d"); //eth (isländisch)
trans.put("ñ", "n"); //n mit Tilde
trans.put("ò", "o"); //o mit Accent grave
trans.put("ó", "o"); //o mit Accent acute
trans.put("ô", "o"); //o mit Circumflex
trans.put("õ", "o"); //o mit Tilde
trans.put("ö", "oe"); //o Umlaut
trans.put("÷", "%"); //Divisions-Zeichen
trans.put("ø", "o"); //o mit Schrägstrich
trans.put("ù", "u"); //u mit Accent grave
trans.put("ú", "u"); //u mit Accent acute
trans.put("û", "u"); //u mit Circumflex
trans.put("ü", "ue"); //u Umlaut
trans.put("ý", "y"); //y mit Accent acute
trans.put("þ", "p"); //thorn (isländisch)
trans.put("ÿ", "y"); //y Umlaut
trans.put("Α", " Alpha "); //Alpha groß
trans.put("α", " alpha "); //alpha klein
trans.put("Β", " Beta "); //Beta groß
trans.put("β", " beta "); //beta klein
trans.put("Γ", " Gamma "); //Gamma groß
trans.put("γ", " gamma "); //gamma klein
trans.put("Δ", " Delta "); //Delta groß
trans.put("δ", " delta "); //delta klein
trans.put("Ε", " Epsilon "); //Epsilon groß
trans.put("ε", " epsilon "); //epsilon klein
trans.put("Ζ", " Zeta "); //Zeta groß
trans.put("ζ", " zeta "); //zeta klein
trans.put("Η", " Eta "); //Eta groß
trans.put("η", " eta "); //eta klein
trans.put("Θ", " Theta "); //Theta groß
trans.put("θ", " theta "); //theta klein
trans.put("Ι", " Iota "); //Iota groß
trans.put("ι", " iota "); //iota klein
trans.put("Κ", " Kappa "); //Kappa groß
trans.put("κ", " kappa "); //kappa klein
trans.put("Λ", " Lambda "); //Lambda groß
trans.put("λ", " lambda "); //lambda klein
trans.put("Μ", " Mu "); //Mu groß
trans.put("μ", " mu "); //mu klein
trans.put("Ν", " Nu "); //Nu groß
trans.put("ν", " nu "); //nu klein
trans.put("Ξ", " Xi "); //Xi groß
trans.put("ξ", " xi "); //xi klein
trans.put("Ο", " Omicron "); //Omicron groß
trans.put("ο", " omicron "); //omicron klein
trans.put("Π", " Pi "); //Pi groß
trans.put("π", " pi "); //pi klein
trans.put("Ρ", " Rho "); //Rho groß
trans.put("ρ", " rho "); //rho klein
trans.put("Σ", " Sigma "); //Sigma groß
trans.put("ς", " sigma "); //sigmaf klein
trans.put("σ", " sigma "); //sigma klein
trans.put("Τ", " Tau "); //Tau groß
trans.put("τ", " tau "); //tau klein
trans.put("Υ", " Ypsilon "); //Upsilon groß
trans.put("υ", " ypsilon "); //upsilon klein
trans.put("Φ", " Phi "); //Phi groß
trans.put("φ", " phi "); //phi klein
trans.put("Χ", " Chi "); //Chi groß
trans.put("χ", " chi "); //chi klein
trans.put("Ψ", " Psi "); //Psi groß
trans.put("ψ", " psi "); //psi klein
trans.put("Ω", " Omega "); //Omega groß
trans.put("ω", " omega "); //omega klein
trans.put("ϑ", " theta "); //theta Symbol
trans.put("ϒ", " ypsilon "); //upsilon mit Haken
trans.put("ϖ", " pi "); //pi Symbol
trans.put("∀", " for all "); //für alle
trans.put("∂", " part of "); //teilweise
trans.put("∃", " exists "); //existiert
trans.put("∅", " null "); //leer
trans.put("∇", " nabla "); //nabla
trans.put("∈", " element of "); //Element von
trans.put("∉", " not element of "); //kein Element von
trans.put("∋", " contains "); //enthält als Element
trans.put("∏", " product "); //Produkt
trans.put("∑", " sum "); //Summe
trans.put("−", " minus "); //minus
trans.put("∗", " times "); //Asterisk
trans.put("√", " sqare root "); //Quadratwurzel
trans.put("∝", " proportional to "); //proportional zu
trans.put("∞", " unlimited "); //unendlich
trans.put("∠", " angle "); //Winkel
trans.put("∧", " and "); //und
trans.put("∨", " or "); //oder
trans.put("∩", " "); //Schnittpunkt
trans.put("∪", " unity "); //Einheit
trans.put("∫", " integral "); //Integral
trans.put("∴", " cause "); //deshalb
trans.put("∼", " similar to "); //ähnlich wie
trans.put("≅", " equal "); //annähernd gleich
trans.put("≈", " equal "); //beinahe gleich
trans.put("≠", " not equal "); //ungleich
trans.put("≡", " identical "); //identisch mit
trans.put("≤", " smaller or equal than "); //kleiner gleich
trans.put("≥", " greater or equal than "); //größer gleich
trans.put("⊂", " subset of "); //Untermenge von
trans.put("⊃", " superset of "); //Obermenge von
trans.put("⊄", " not subset of "); //keine Untermenge von
trans.put("⊆", ""); //Untermenge von oder gleich mit
trans.put("⊇", ""); //Obermenge von oder gleich mit
trans.put("⊕", ""); //Direktsumme
trans.put("⊗", ""); //Vektorprodukt
trans.put("⊥", ""); //senkrecht zu
trans.put("⋅", ""); //Punkt-Operator
trans.put("◊", ""); //Raute
trans.put("⌈", ""); //links oben
trans.put("⌉", ""); //rechts oben
trans.put("⌊", ""); //links unten
trans.put("⌋", ""); //rechts unten
trans.put("〈", ""); //spitze Klammer links
trans.put("〉", ""); //spitze Klammer rechts
trans.put("←", ""); //Pfeil links
trans.put("↑", ""); //Pfeil oben
trans.put("→", ""); //Pfeil rechts
trans.put("↓", ""); //Pfeil unten
trans.put("↔", ""); //Pfeil links/rechts
trans.put("↵", ""); //Pfeil unten-Knick-links
trans.put("⇐", ""); //Doppelpfeil links
trans.put("⇑", ""); //Doppelpfeil oben
trans.put("⇒", ""); //Doppelpfeil rechts
trans.put("⇓", ""); //Doppelpfeil unten
trans.put("⇔", ""); //Doppelpfeil links/rechts
trans.put("•", ""); //Bullet-Zeichen
trans.put("…", ""); //Horizontale Ellipse
trans.put("′", ""); //Minutenzeichen
trans.put("‾", ""); //Überstrich
trans.put("⁄", ""); //Bruchstrich
trans.put("℘", ""); //Weierstrass p
trans.put("ℑ", ""); //Zeichen für "imaginär"
trans.put("ℜ", ""); //Zeichen für "real"
trans.put("™", ""); //Trademark-Zeichen
trans.put("€", ""); //Euro-Zeichen
trans.put("ℵ", ""); //Alef-Symbol
trans.put("♠", ""); //Pik-Zeichen
trans.put("♣", ""); //Kreuz-Zeichen
trans.put("♥", ""); //Herz-Zeichen
trans.put("♦", ""); //Karo-Zeichen
trans.put(" ", ""); //Leerzeichen Breite n
trans.put(" ", ""); //Leerzeichen Breite m
trans.put(" ", ""); //Schmales Leerzeichen
trans.put("", ""); //null breiter Nichtverbinder
trans.put("", ""); //null breiter Verbinder
trans.put("", ""); //links-nach-rechts-Zeichen
trans.put("", ""); //rechts-nach-links-Zeichen
trans.put("–", ""); //Gedankenstrich Breite n
trans.put("—", ""); //Gedankenstrich Breite m
trans.put("‘", ""); //einfaches Anführungszeichen links
trans.put("’", ""); //einfaches Anführungszeichen rechts
trans.put("‚", ""); //einfaches low-9-Zeichen
trans.put("“", ""); //doppeltes Anführungszeichen links
trans.put("”", ""); //doppeltes Anführungszeichen rechts
trans.put("„", ""); //doppeltes low-9-Zeichen rechts
trans.put("†", ""); //Kreuz
trans.put("‡", ""); //Doppelkreuz
trans.put("‰", ""); //zu tausend
trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links
trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts
}
public htmlFilterAbstractScraper(TreeSet tags0, TreeSet tags1) {
this.tags0 = tags0;
this.tags1 = tags1;
}
public boolean isTag0(String tag) {
return (tags0 != null) && (tags0.contains(tag));
}
public boolean isTag1(String tag) {
return (tags1 != null) && (tags1.contains(tag));
}
//the 'missing' method that shall be implemented:
public abstract void scrapeText(char[] text);
// the other methods must take into account to construct the return value correctly
public abstract void scrapeTag0(String tagname, Properties tagopts);
public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
// string conversions
private static String code_iso8859s(char c) {
switch (c) {
// german umlaute and ligaturen
case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE";
case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue";
case 0xdf: return "ss";
// accent on letters; i.e. french characters
case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A";
case 0xc6: return "AE";
case 0xc7: return "C";
case 0xc8: case 0xc9: case 0xca: return "E";
case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I";
case 0xd0: return "D";
case 0xd1: return "N";
case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O";
case 0xd7: return "x";
case 0xd9: case 0xda: case 0xdb: return "U";
case 0xdd: return "Y";
case 0xde: return "p";
case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a";
case 0xe6: return "ae";
case 0xe7: return "c";
case 0xe8: case 0xe9: case 0xea: return "e";
case 0xec: case 0xed: case 0xee: case 0xef: return "i";
case 0xf0: return "d";
case 0xf1: return "n";
case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o";
case 0xf7: return "%";
case 0xf9: case 0xfa: case 0xfb: return "u";
case 0xfd: case 0xff: return "y";
case 0xfe: return "p";
// special characters
case 0xa4: return " euro ";
default: return null;
}
}
public static serverCharBuffer convertUmlaute(serverCharBuffer bb) {
if (bb.length() == 0) return bb;
serverCharBuffer t = new serverCharBuffer(bb.length() + 20);
char c;
for (int i = 0; i < bb.length(); i++) {
c = bb.charAt(i);
String z = code_iso8859s(c);
if (z == null) t.append(c);
else t.append(z);
}
return t;
// serverByteBuffer t = new serverByteBuffer(bb.length() + 20);
// int b0, b1, b2;
// String z;
// int i = 0;
// while (i < bb.length()) {
// b0 = bb.byteAt(i) & 0xff;
// // check utf-8 encoding
// if ((b0 < 128) || (i + 1 == bb.length())) {
// t.append(b0);
// i++;
// } else {
// b1 = bb.byteAt(i + 1) & 0xff;
// if (b1 > 0x3f) {
// z = code_iso8859s(b0);
// i++;
// } else if ((b0 > 0xbf) && (b0 < 0xe0)) {
// z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f));
// i += 2;
// } else {
// if (i + 2 >= bb.length()) {
// z = null;
// i++;
// } else {
// b2 = bb.byteAt(i + 2) & 0xff;
// if (b2 > 0x3f) {
// z = code_iso8859s(b0);
// i++;
// } else {
// z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f));
// i += 3;
// }
// }
// }
// if (z == null) t.append(b0); else t.append(z);
// }
// }
// return t;
}
private static char[] transscript(char[] code) {
String t = (String) trans.get(new String(code));
if (t == null) return new char[0];
return t.toCharArray();
}
protected static serverCharBuffer transscriptAll(serverCharBuffer bb) {
int p0 = 0, p1;
char[] t;
while ((p0 = bb.indexOf('&', p0)) >= 0) {
p1 = bb.indexOf(';', p0);
if (p1 >= 0) {
t = transscript(bb.getChars(p0, p1 + 1));
bb = new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getChars(p1 + 1));
} else {
bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).append(bb.getChars(p0 + 1));
}
}
t = null;
return bb;
}
protected static serverCharBuffer stripAllTags(serverCharBuffer bb) {
int p0 = 0, p1;
while ((p0 = bb.indexOf(lb, p0)) >= 0) {
p1 = bb.indexOf(rb, p0);
if (p1 >= 0) {
bb = ((serverCharBuffer)new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + 1).trim().append(32)).append(new serverCharBuffer(bb.getChars(p1 + 1)).trim());
} else {
bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).trim().append(new serverCharBuffer(bb.getChars(p0 + 1)).trim());
}
}
return bb.trim();
}
public static serverCharBuffer stripAll(serverCharBuffer bb) {
//return stripAllTags(s);
return convertUmlaute(transscriptAll(stripAllTags(bb)));
}
public void close() {
// free resources
tags0 = null;
tags1 = null;
}
public void finalize() {
close();
}
}
\ No newline at end of file
+// htmlFilterAbstractScraper.java
// ---------------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 18.02.2004
//
// You agree that the Author(s) is (are) not responsible for cost,
// loss of data or any harm that may be caused by usage of this softare or
// this documentation. The usage of this software is on your own risk. The
// installation and usage (starting/running) of this software may allow other
// people or application to access your computer and any attached devices and
// is highly dependent on the configuration of the software which must be
// done by the user of the software;the author(s) is (are) also
// not responsible for proper configuration and usage of the software, even
// if provoked by documentation provided together with the software.
//
// THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION
// IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS
// FILE AND AS IN http://www.gnu.org/licenses/gpl.txt
// ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE
// LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT
// BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION
// MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE.
// CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH.
package de.anomic.htmlFilter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Properties;
import java.util.TreeSet;
import de.anomic.server.serverCharBuffer;
public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
// opening delimiter of an HTML tag
public static final char lb = '<';
// closing delimiter of an HTML tag
public static final char rb = '>';
// slash character (presumably used by subclasses/parsers for end tags -- not referenced in this class)
public static final char sl = '/';
// tag names answered positively by isTag0(); may be null (then isTag0 is always false)
private TreeSet tags0;
// tag names answered positively by isTag1(); may be null (then isTag1 is always false)
private TreeSet tags1;
// translation table: HTML character entity -> plain-text replacement (looked up by transscript)
private static HashMap trans = new HashMap(300);
static {
trans.put(""", "\""); //Anführungszeichen oben
trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und
trans.put("<", "<"); //öffnende spitze Klammer
trans.put(">", ">"); //schließende spitze Klammer
trans.put(" ", " "); //Erzwungenes Leerzeichen
trans.put("¡", "!"); //umgekehrtes Ausrufezeichen
trans.put("¢", " cent "); //Cent-Zeichen
trans.put("£", " pound "); //Pfund-Zeichen
trans.put("¤", " currency "); //Währungs-Zeichen
trans.put("¥", " yen "); //Yen-Zeichen
trans.put("¦", " "); //durchbrochener Strich
trans.put("§", " paragraph "); //Paragraph-Zeichen
trans.put("¨", " "); //Pünktchen oben
trans.put("©", " copyright "); //Copyright-Zeichen
trans.put("ª", " "); //Ordinal-Zeichen weiblich
trans.put("«", " "); //angewinkelte Anführungszeichen links
trans.put("¬", " not "); //Verneinungs-Zeichen
trans.put("", "-"); //kurzer Trennstrich
trans.put("®", " trademark "); //Registriermarke-Zeichen
trans.put("¯", " "); //Überstrich
trans.put("°", " degree "); //Grad-Zeichen
trans.put("±", " +/- "); //Plusminus-Zeichen
trans.put("²", " square "); //Hoch-2-Zeichen
trans.put("³", " 3 "); //Hoch-3-Zeichen
trans.put("´", " "); //Acute-Zeichen
trans.put("µ", " micro "); //Mikro-Zeichen
trans.put("¶", " paragraph "); //Absatz-Zeichen
trans.put("·", " "); //Mittelpunkt
trans.put("¸", " "); //Häkchen unten
trans.put("¹", " "); //Hoch-1-Zeichen
trans.put("º", " degree "); //Ordinal-Zeichen männlich
trans.put("»", " "); //angewinkelte Anführungszeichen rechts
trans.put("¼", " quarter "); //ein Viertel
trans.put("½", " half "); //ein Halb
trans.put("¾", " 3/4 "); //drei Viertel
trans.put("¿", "?"); //umgekehrtes Fragezeichen
trans.put("À", "A"); //A mit Accent grave
trans.put("Á", "A"); //A mit Accent acute
trans.put("Â", "A"); //A mit Circumflex
trans.put("Ã", "A"); //A mit Tilde
trans.put("Ä", "Ae"); //A Umlaut
trans.put("Å", "A"); //A mit Ring
trans.put("Æ", "A"); //A mit legiertem E
trans.put("Ç", "C"); //C mit Häkchen
trans.put("È", "E"); //E mit Accent grave
trans.put("É", "E"); //E mit Accent acute
trans.put("Ê", "E"); //E mit Circumflex
trans.put("Ë", "E"); //E Umlaut
trans.put("Ì", "I"); //I mit Accent grave
trans.put("Í", "I"); //I mit Accent acute
trans.put("Î", "I"); //I mit Circumflex
trans.put("Ï", "I"); //I Umlaut
trans.put("Ð", "D"); //Eth (isländisch)
trans.put("Ñ", "N"); //N mit Tilde
trans.put("Ò", "O"); //O mit Accent grave
trans.put("Ó", "O"); //O mit Accent acute
trans.put("Ô", "O"); //O mit Circumflex
trans.put("Õ", "O"); //O mit Tilde
trans.put("Ö", "Oe"); //O Umlaut
trans.put("×", " times "); //Mal-Zeichen
trans.put("Ø", "O"); //O mit Schrägstrich
trans.put("Ù", "U"); //U mit Accent grave
trans.put("Ú", "U"); //U mit Accent acute
trans.put("Û", "U"); //U mit Circumflex
trans.put("Ü", "Ue"); //U Umlaut
trans.put("Ý", "Y"); //Y mit Accent acute
trans.put("Þ", "P"); //THORN (isländisch)
trans.put("ß", "ss"); //scharfes S
trans.put("à", "a"); //a mit Accent grave
trans.put("á", "a"); //a mit Accent acute
trans.put("â", "a"); //a mit Circumflex
trans.put("ã", "a"); //a mit Tilde
trans.put("ä", "ae"); //a Umlaut
trans.put("å", "a"); //a mit Ring
trans.put("æ", "a"); //a mit legiertem e
trans.put("ç", "c"); //c mit Häkchen
trans.put("è", "e"); //e mit Accent grave
trans.put("é", "e"); //e mit Accent acute
trans.put("ê", "e"); //e mit Circumflex
trans.put("ë", "e"); //e Umlaut
trans.put("ì", "i"); //i mit Accent grave
trans.put("í", "i"); //i mit Accent acute
trans.put("î", "i"); //i mit Circumflex
trans.put("ï", "i"); //i Umlaut
trans.put("ð", "d"); //eth (isländisch)
trans.put("ñ", "n"); //n mit Tilde
trans.put("ò", "o"); //o mit Accent grave
trans.put("ó", "o"); //o mit Accent acute
trans.put("ô", "o"); //o mit Circumflex
trans.put("õ", "o"); //o mit Tilde
trans.put("ö", "oe"); //o Umlaut
trans.put("÷", "%"); //Divisions-Zeichen
trans.put("ø", "o"); //o mit Schrägstrich
trans.put("ù", "u"); //u mit Accent grave
trans.put("ú", "u"); //u mit Accent acute
trans.put("û", "u"); //u mit Circumflex
trans.put("ü", "ue"); //u Umlaut
trans.put("ý", "y"); //y mit Accent acute
trans.put("þ", "p"); //thorn (isländisch)
trans.put("ÿ", "y"); //y Umlaut
trans.put("Α", " Alpha "); //Alpha groß
trans.put("α", " alpha "); //alpha klein
trans.put("Β", " Beta "); //Beta groß
trans.put("β", " beta "); //beta klein
trans.put("Γ", " Gamma "); //Gamma groß
trans.put("γ", " gamma "); //gamma klein
trans.put("Δ", " Delta "); //Delta groß
trans.put("δ", " delta "); //delta klein
trans.put("Ε", " Epsilon "); //Epsilon groß
trans.put("ε", " epsilon "); //epsilon klein
trans.put("Ζ", " Zeta "); //Zeta groß
trans.put("ζ", " zeta "); //zeta klein
trans.put("Η", " Eta "); //Eta groß
trans.put("η", " eta "); //eta klein
trans.put("Θ", " Theta "); //Theta groß
trans.put("θ", " theta "); //theta klein
trans.put("Ι", " Iota "); //Iota groß
trans.put("ι", " iota "); //iota klein
trans.put("Κ", " Kappa "); //Kappa groß
trans.put("κ", " kappa "); //kappa klein
trans.put("Λ", " Lambda "); //Lambda groß
trans.put("λ", " lambda "); //lambda klein
trans.put("Μ", " Mu "); //Mu groß
trans.put("μ", " mu "); //mu klein
trans.put("Ν", " Nu "); //Nu groß
trans.put("ν", " nu "); //nu klein
trans.put("Ξ", " Xi "); //Xi groß
trans.put("ξ", " xi "); //xi klein
trans.put("Ο", " Omicron "); //Omicron groß
trans.put("ο", " omicron "); //omicron klein
trans.put("Π", " Pi "); //Pi groß
trans.put("π", " pi "); //pi klein
trans.put("Ρ", " Rho "); //Rho groß
trans.put("ρ", " rho "); //rho klein
trans.put("Σ", " Sigma "); //Sigma groß
trans.put("ς", " sigma "); //sigmaf klein
trans.put("σ", " sigma "); //sigma klein
trans.put("Τ", " Tau "); //Tau groß
trans.put("τ", " tau "); //tau klein
trans.put("Υ", " Ypsilon "); //Upsilon groß
trans.put("υ", " ypsilon "); //upsilon klein
trans.put("Φ", " Phi "); //Phi groß
trans.put("φ", " phi "); //phi klein
trans.put("Χ", " Chi "); //Chi groß
trans.put("χ", " chi "); //chi klein
trans.put("Ψ", " Psi "); //Psi groß
trans.put("ψ", " psi "); //psi klein
trans.put("Ω", " Omega "); //Omega groß
trans.put("ω", " omega "); //omega klein
trans.put("ϑ", " theta "); //theta Symbol
trans.put("ϒ", " ypsilon "); //upsilon mit Haken
trans.put("ϖ", " pi "); //pi Symbol
trans.put("∀", " for all "); //für alle
trans.put("∂", " part of "); //teilweise
trans.put("∃", " exists "); //existiert
trans.put("∅", " null "); //leer
trans.put("∇", " nabla "); //nabla
trans.put("∈", " element of "); //Element von
trans.put("∉", " not element of "); //kein Element von
trans.put("∋", " contains "); //enthält als Element
trans.put("∏", " product "); //Produkt
trans.put("∑", " sum "); //Summe
trans.put("−", " minus "); //minus
trans.put("∗", " times "); //Asterisk
trans.put("√", " sqare root "); //Quadratwurzel
trans.put("∝", " proportional to "); //proportional zu
trans.put("∞", " unlimited "); //unendlich
trans.put("∠", " angle "); //Winkel
trans.put("∧", " and "); //und
trans.put("∨", " or "); //oder
trans.put("∩", " "); //Schnittpunkt
trans.put("∪", " unity "); //Einheit
trans.put("∫", " integral "); //Integral
trans.put("∴", " cause "); //deshalb
trans.put("∼", " similar to "); //ähnlich wie
trans.put("≅", " equal "); //annähernd gleich
trans.put("≈", " equal "); //beinahe gleich
trans.put("≠", " not equal "); //ungleich
trans.put("≡", " identical "); //identisch mit
trans.put("≤", " smaller or equal than "); //kleiner gleich
trans.put("≥", " greater or equal than "); //größer gleich
trans.put("⊂", " subset of "); //Untermenge von
trans.put("⊃", " superset of "); //Obermenge von
trans.put("⊄", " not subset of "); //keine Untermenge von
trans.put("⊆", ""); //Untermenge von oder gleich mit
trans.put("⊇", ""); //Obermenge von oder gleich mit
trans.put("⊕", ""); //Direktsumme
trans.put("⊗", ""); //Vektorprodukt
trans.put("⊥", ""); //senkrecht zu
trans.put("⋅", ""); //Punkt-Operator
trans.put("◊", ""); //Raute
trans.put("⌈", ""); //links oben
trans.put("⌉", ""); //rechts oben
trans.put("⌊", ""); //links unten
trans.put("⌋", ""); //rechts unten
trans.put("〈", ""); //spitze Klammer links
trans.put("〉", ""); //spitze Klammer rechts
trans.put("←", ""); //Pfeil links
trans.put("↑", ""); //Pfeil oben
trans.put("→", ""); //Pfeil rechts
trans.put("↓", ""); //Pfeil unten
trans.put("↔", ""); //Pfeil links/rechts
trans.put("↵", ""); //Pfeil unten-Knick-links
trans.put("⇐", ""); //Doppelpfeil links
trans.put("⇑", ""); //Doppelpfeil oben
trans.put("⇒", ""); //Doppelpfeil rechts
trans.put("⇓", ""); //Doppelpfeil unten
trans.put("⇔", ""); //Doppelpfeil links/rechts
trans.put("•", ""); //Bullet-Zeichen
trans.put("…", ""); //Horizontale Ellipse
trans.put("′", ""); //Minutenzeichen
trans.put("‾", ""); //Überstrich
trans.put("⁄", ""); //Bruchstrich
trans.put("℘", ""); //Weierstrass p
trans.put("ℑ", ""); //Zeichen für "imaginär"
trans.put("ℜ", ""); //Zeichen für "real"
trans.put("™", ""); //Trademark-Zeichen
trans.put("€", ""); //Euro-Zeichen
trans.put("ℵ", ""); //Alef-Symbol
trans.put("♠", ""); //Pik-Zeichen
trans.put("♣", ""); //Kreuz-Zeichen
trans.put("♥", ""); //Herz-Zeichen
trans.put("♦", ""); //Karo-Zeichen
trans.put(" ", ""); //Leerzeichen Breite n
trans.put(" ", ""); //Leerzeichen Breite m
trans.put(" ", ""); //Schmales Leerzeichen
trans.put("", ""); //null breiter Nichtverbinder
trans.put("", ""); //null breiter Verbinder
trans.put("", ""); //links-nach-rechts-Zeichen
trans.put("", ""); //rechts-nach-links-Zeichen
trans.put("–", ""); //Gedankenstrich Breite n
trans.put("—", ""); //Gedankenstrich Breite m
trans.put("‘", ""); //einfaches Anführungszeichen links
trans.put("’", ""); //einfaches Anführungszeichen rechts
trans.put("‚", ""); //einfaches low-9-Zeichen
trans.put("“", ""); //doppeltes Anführungszeichen links
trans.put("”", ""); //doppeltes Anführungszeichen rechts
trans.put("„", ""); //doppeltes low-9-Zeichen rechts
trans.put("†", ""); //Kreuz
trans.put("‡", ""); //Doppelkreuz
trans.put("‰", ""); //zu tausend
trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links
trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts
}
/**
 * Creates a scraper that recognises the two given sets of tag names.
 *
 * @param tags0 tag names reported by {@link #isTag0(String)}; may be null
 * @param tags1 tag names reported by {@link #isTag1(String)}; may be null
 */
public htmlFilterAbstractScraper(TreeSet tags0, TreeSet tags1) {
    this.tags1 = tags1;
    this.tags0 = tags0;
}
/**
 * Tells whether the given tag name belongs to the first tag set.
 *
 * @param tag the tag name to look up
 * @return true if a first tag set exists and contains the name, false otherwise
 */
public boolean isTag0(String tag) {
    if (tags0 == null) {
        return false;
    }
    return tags0.contains(tag);
}
/**
 * Tells whether the given tag name belongs to the second tag set.
 *
 * @param tag the tag name to look up
 * @return true if a second tag set exists and contains the name, false otherwise
 */
public boolean isTag1(String tag) {
    if (tags1 == null) {
        return false;
    }
    return tags1.contains(tag);
}
// Callbacks that concrete scrapers have to implement.

/**
 * Receives a run of characters.
 * NOTE(review): presumably the text found between tags -- confirm against the caller.
 *
 * @param text the characters to process
 */
public abstract void scrapeText(char[] text);

// The tag callbacks below must be implemented so that the scraper's result is built up correctly.

/**
 * Receives a tag together with its attributes but without any enclosed text.
 * NOTE(review): presumably invoked for tag names accepted by isTag0 -- confirm against the caller.
 *
 * @param tagname name of the tag
 * @param tagopts attributes of the tag
 */
public abstract void scrapeTag0(String tagname, Properties tagopts);

/**
 * Receives a tag together with its attributes and an accompanying run of characters.
 * NOTE(review): presumably invoked for tag names accepted by isTag1, with the text
 * enclosed by the tag -- confirm against the caller.
 *
 * @param tagname name of the tag
 * @param tagopts attributes of the tag
 * @param text    the characters passed along with the tag
 */
public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
// string conversions
/**
 * Maps a single character from the upper Latin-1 range (0xA4, 0xC0-0xFF) to a
 * plain ASCII transliteration.
 *
 * German umlauts and the sharp s are expanded to two letters, accented letters
 * are reduced to their base letter, and a few symbols get a textual replacement.
 *
 * @param c the character to transliterate
 * @return the replacement string, or null if no replacement is defined for c
 */
private static String code_iso8859s(char c) {
    switch (c) {
    // german umlauts and sharp s
    case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE";
    case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue";
    case 0xdf: return "ss";
    // accented capital letters, e.g. french characters
    case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A";
    case 0xc6: return "AE";
    case 0xc7: return "C";
    // 0xcb (E with diaeresis) belongs to this group as well
    case 0xc8: case 0xc9: case 0xca: case 0xcb: return "E";
    case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I";
    case 0xd0: return "D";
    case 0xd1: return "N";
    case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O";
    case 0xd7: return "x";
    case 0xd9: case 0xda: case 0xdb: return "U";
    case 0xdd: return "Y";
    // capital thorn: upper-case replacement, in line with the entity translation table
    case 0xde: return "P";
    // accented small letters
    case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a";
    case 0xe6: return "ae";
    case 0xe7: return "c";
    // 0xeb (e with diaeresis) belongs to this group as well
    case 0xe8: case 0xe9: case 0xea: case 0xeb: return "e";
    case 0xec: case 0xed: case 0xee: case 0xef: return "i";
    case 0xf0: return "d";
    case 0xf1: return "n";
    case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o";
    case 0xf7: return "%";
    case 0xf9: case 0xfa: case 0xfb: return "u";
    case 0xfd: case 0xff: return "y";
    case 0xfe: return "p";
    // special characters (0xa4 is the euro sign in ISO-8859-15)
    case 0xa4: return " euro ";
    default: return null;
    }
}
/**
 * Replaces every character that has a transliteration in code_iso8859s
 * (umlauts, accented letters, a few symbols) by its ASCII replacement and
 * copies all other characters unchanged.
 *
 * @param bb the buffer to convert; returned as-is when it is empty
 * @return a new buffer holding the converted text; if appending to the new
 *         buffer fails with an IOException, the unconverted input buffer is
 *         returned instead (never null)
 */
public static serverCharBuffer convertUmlaute(serverCharBuffer bb) {
    final int len = bb.length();
    if (len == 0) return bb;
    try {
        // replacements are at most a few characters long; start with some head room
        final serverCharBuffer t = new serverCharBuffer(len + 20);
        for (int i = 0; i < len; i++) {
            final char c = bb.charAt(i);
            final String z = code_iso8859s(c);
            if (z == null) {
                t.append(c);
            } else {
                t.append(z);
            }
        }
        return t;
    } catch (IOException e) {
        // Best effort: hand back the unconverted text instead of null, so that
        // callers (e.g. stripAll) do not fail on a null buffer.
        return bb;
    }
}
/**
 * Looks up the replacement text for one HTML character coding in the
 * translation table.
 *
 * @param code the characters of the coding to look up
 * @return the replacement characters, or an empty array if the table has no entry
 */
private static char[] transscript(char[] code) {
    final String replacement = (String) trans.get(new String(code));
    return (replacement == null) ? new char[0] : replacement.toCharArray();
}
/**
 * Replaces every span that starts at '&' and ends at the next ';' by the
 * replacement that transscript() yields for it (nothing if the span is unknown).
 * A '&' that is not followed by any ';' is removed.
 *
 * @param bb the buffer to process
 * @return a buffer with all such spans substituted; the input buffer itself
 *         if it contains no '&'
 */
protected static serverCharBuffer transscriptAll(serverCharBuffer bb) {
    int p0 = 0, p1;
    char[] t;
    while ((p0 = bb.indexOf('&', p0)) >= 0) {
        p1 = bb.indexOf(';', p0);
        if (p1 >= 0) {
            // NOTE(review): assumes getChars(from, to) excludes 'to', so the span includes the ';' -- confirm
            t = transscript(bb.getChars(p0, p1 + 1));
            bb = new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getChars(p1 + 1));
            // Continue behind the inserted replacement. The replacement may itself
            // be "&" (the table maps an entity to a literal ampersand); scanning it
            // again would treat it as the start of a new span and delete it together
            // with any text up to the next ';'.
            p0 += t.length;
        } else {
            // no terminating ';' anywhere behind this '&': drop the stray ampersand
            bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).append(bb.getChars(p0 + 1));
        }
    }
    return bb;
}
/**
 * Removes markup from the text: every span from lb up to the next rb is
 * replaced by a single space (character 32), an lb with no rb after it is
 * dropped on its own, and the text on both sides of each cut is trimmed.
 *
 * After each cut the search continues at the end of the rebuilt prefix.
 * The prefix has been through trim(); if trim() shortens it, continuing at
 * the old offset of the removed lb could skip an lb that now sits before
 * that offset and leave a tag in the result.
 *
 * @param bb the text to strip
 * @return the trimmed text without the removed spans
 */
protected static serverCharBuffer stripAllTags(serverCharBuffer bb) {
    int p0 = 0, p1;
    while ((p0 = bb.indexOf(lb, p0)) >= 0) {
        p1 = bb.indexOf(rb, p0);
        final serverCharBuffer head;
        final serverCharBuffer tail;
        if (p1 >= 0) {
            // closed tag: keep the trimmed prefix, one separating space, and the trimmed rest
            head = (serverCharBuffer) new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + 1).trim().append(32);
            tail = new serverCharBuffer(bb.getChars(p1 + 1)).trim();
        } else {
            // unclosed tag: only the lb character itself is removed
            head = new serverCharBuffer(bb.getChars(0, p0), bb.length()).trim();
            tail = new serverCharBuffer(bb.getChars(p0 + 1)).trim();
        }
        // the prefix holds no lb (it was already searched), so the next
        // search may start exactly where the appended rest begins
        p0 = head.length();
        bb = head.append(tail);
    }
    return bb.trim();
}
/**
 * Runs the complete clean-up chain on the given text: stripAllTags first,
 * then transscriptAll, then convertUmlaute.
 *
 * @param bb the text to clean
 * @return the result of the last step, convertUmlaute
 */
public static serverCharBuffer stripAll(serverCharBuffer bb) {
    final serverCharBuffer withoutTags = stripAllTags(bb);
    final serverCharBuffer transcribed = transscriptAll(withoutTags);
    return convertUmlaute(transcribed);
}
/**
 * Frees resources by clearing the tags0 and tags1 references.
 */
public void close() {
    this.tags0 = null;
    this.tags1 = null;
}
/**
 * Finalizer hook: delegates to close() so the references it clears are
 * released even if close() was never called explicitly.
 */
public void finalize() {
    this.close();
}
}
\ No newline at end of file
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index b41d97ca6..2252f3844 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -43,6 +43,9 @@
package de.anomic.htmlFilter;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
@@ -57,7 +60,9 @@ import java.util.Properties;
import java.util.TreeSet;
import de.anomic.net.URL;
+import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCharBuffer;
+import de.anomic.server.serverFileUtils;
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
@@ -113,6 +118,23 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.content = new serverCharBuffer(1024);
}
+ public void transformCharset(String oldCharset, String newCharset) throws UnsupportedEncodingException {
+// // convert the content back to the old bytearray
+// ByteArrayInputStream temp = new ByteArrayInputStream(new String(this.content.getChars()).getBytes(oldCharset));
+//
+// // create a reader with the new charset
+// serverCharBuffer newContent = new serverCharBuffer(this.content.length());
+// try {
+// InputStreamReader reader = new InputStreamReader(temp,newCharset);
+// serverFileUtils.copy(reader, newContent);
+// reader.close();
+// } catch (IOException e) {
+// // ignore this
+// }
+//
+// this.content = newContent;
+ }
+
public void scrapeText(char[] newtext) {
// System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32);
@@ -246,10 +268,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
for (int j = 0; j < headlines[i - 1].size(); j++) s[j] = (String) headlines[i - 1].get(j);
return s;
}
-
+
public byte[] getText() {
+ return this.getText("UTF-8");
+ }
+
+ public byte[] getText(String charSet) {
try {
- return content.toString().getBytes("UTF-8");
+ return content.toString().getBytes(charSet);
} catch (UnsupportedEncodingException e) {
return content.toString().getBytes();
}
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
index d39a0b9fc..1f20f51db 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
@@ -46,6 +46,7 @@ package de.anomic.htmlFilter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
+import java.io.IOException;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Locale;
@@ -112,14 +113,19 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
}
private static char[] genBlueLetters(int length) {
- serverCharBuffer bb = new serverCharBuffer(" ".toCharArray());
- length = length / 2;
- if (length > 10) length = 7;
- while (length-- > 0) {
- bb.append('X');
+ try {
+ serverCharBuffer bb = new serverCharBuffer(" ".toCharArray());
+ length = length / 2;
+ if (length > 10) length = 7;
+ while (length-- > 0) {
+ bb.append('X');
+ }
+ bb.append(" ");
+ return bb.getChars();
+ } catch (IOException e) {
+ // ignore this.
+ return null;
}
- bb.append(" ");
- return bb.getChars();
}
private boolean bluelistHit(char[] text) {
diff --git a/source/de/anomic/htmlFilter/htmlFilterWriter.java b/source/de/anomic/htmlFilter/htmlFilterWriter.java
index a232be49a..5426b696e 100644
--- a/source/de/anomic/htmlFilter/htmlFilterWriter.java
+++ b/source/de/anomic/htmlFilter/htmlFilterWriter.java
@@ -117,67 +117,92 @@ public final class htmlFilterWriter extends Writer {
}
public static char[] genTag0raw(String tagname, boolean opening, char[] tagopts) {
- serverCharBuffer bb = new serverCharBuffer(tagname.length() + tagopts.length + 3);
- bb.append('<');
- if (!opening) {
- bb.append('/');
- }
- bb.append(tagname);
- if (tagopts.length > 0) {
-// if (tagopts[0] == (byte) 32)
- bb.append(tagopts);
-// else bb.append((byte) 32).append(tagopts);
+ try {
+ serverCharBuffer bb = new serverCharBuffer(tagname.length() + tagopts.length + 3);
+ bb.append('<');
+ if (!opening) {
+ bb.append('/');
+ }
+ bb.append(tagname);
+ if (tagopts.length > 0) {
+// if (tagopts[0] == (byte) 32)
+ bb.append(tagopts);
+// else bb.append((byte) 32).append(tagopts);
+ }
+ bb.append('>');
+ return bb.getChars();
+ } catch (IOException e) {
+ // ignore this
+ return null;
}
- bb.append('>');
- return bb.getChars();
}
public static char[] genTag1raw(String tagname, char[] tagopts, char[] text) {
- serverCharBuffer bb = new serverCharBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
- bb.append('<').append(tagname);
- if (tagopts.length > 0) {
-// if (tagopts[0] == (byte) 32)
- bb.append(tagopts);
-// else bb.append((byte) 32).append(tagopts);
+ try {
+ serverCharBuffer bb = new serverCharBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
+ bb.append('<').append(tagname);
+ if (tagopts.length > 0) {
+// if (tagopts[0] == (byte) 32)
+ bb.append(tagopts);
+// else bb.append((byte) 32).append(tagopts);
+ }
+ bb.append('>');
+ bb.append(text);
+ bb.append('<').append('/').append(tagname).append('>');
+ return bb.getChars();
+ } catch (IOException e) {
+ // ignore this
+ return null;
}
- bb.append('>');
- bb.append(text);
- bb.append('<').append('/').append(tagname).append('>');
- return bb.getChars();
}
- public static char[] genTag0(String tagname, Properties tagopts, char quotechar) {
- char[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar);
- serverCharBuffer bb = new serverCharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
- bb.append('<').append(tagname);
- if (tagoptsx != null) {
- bb.append(32);
- bb.append(tagoptsx);
- }
- bb.append('>');
- return bb.getChars();
+ public static char[] genTag0(String tagname, Properties tagopts, char quotechar) {
+ try {
+ char[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar);
+ serverCharBuffer bb = new serverCharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
+ bb.append('<').append(tagname);
+ if (tagoptsx != null) {
+ bb.append(32);
+ bb.append(tagoptsx);
+ }
+ bb.append('>');
+ return bb.getChars();
+ } catch (IOException e) {
+ // ignore this
+ return null;
+ }
}
- public static char[] genTag1(String tagname, Properties tagopts, char[] text, char quotechar) {
- char[] gt0 = genTag0(tagname, tagopts, quotechar);
- serverCharBuffer cb = new serverCharBuffer(gt0, gt0.length + text.length + tagname.length() + 3);
- cb.append(text).append('<').append('/').append(tagname).append('>');
- return cb.getChars();
+ public static char[] genTag1(String tagname, Properties tagopts, char[] text, char quotechar) {
+ try {
+ char[] gt0 = genTag0(tagname, tagopts, quotechar);
+ serverCharBuffer cb = new serverCharBuffer(gt0, gt0.length + text.length + tagname.length() + 3);
+ cb.append(text).append('<').append('/').append(tagname).append('>');
+ return cb.getChars();
+ } catch (IOException e) {
+ // ignore this
+ return null;
+ }
}
// a helper method for pretty-printing of properties for html tags
- public static char[] genOpts(Properties prop, char quotechar) {
- Enumeration e = prop.propertyNames();
- serverCharBuffer bb = new serverCharBuffer(prop.size() * 40);
- String key;
- while (e.hasMoreElements()) {
- key = (String) e.nextElement();
- bb.append(32).append(key).append('=').append(quotechar);
- bb.append(prop.getProperty(key));
- bb.append(quotechar);
+ public static char[] genOpts(Properties prop, char quotechar) {
+ try {
+ Enumeration e = prop.propertyNames();
+ serverCharBuffer bb = new serverCharBuffer(prop.size() * 40);
+ String key;
+ while (e.hasMoreElements()) {
+ key = (String) e.nextElement();
+ bb.append(32).append(key).append('=').append(quotechar);
+ bb.append(prop.getProperty(key));
+ bb.append(quotechar);
+ }
+ if (bb.length() > 0) return bb.getChars(1);
+ return bb.getChars();
+ }catch (IOException e) {
+ // ignore this
+ return null;
}
- if (bb.length() > 0) return bb.getChars(1);
- return bb.getChars();
}
private char[] filterTag(String tag, boolean opening, char[] content, char quotechar) {
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 02c360b38..17ca8e3f6 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -598,6 +598,14 @@ public final class plasmaParser {
public plasmaParserDocument transformScraper(URL location, String mimeType, String charSet, htmlFilterContentScraper scraper) {
try {
+ if (scraper.getMetas().containsKey("content-type")) {
+ String newCharset = (String) scraper.getMetas().get("content-type");
+ if (!charSet.equals(newCharset)) {
+ // TODO: transformation of content needed
+ this.theLogger.logFine("Charset transformation needed from '" + charSet + "' to '" + newCharset + "'");
+ }
+ }
+
String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
diff --git a/source/de/anomic/server/serverCharBuffer.java b/source/de/anomic/server/serverCharBuffer.java
index 7d9687a9d..de70bb360 100644
--- a/source/de/anomic/server/serverCharBuffer.java
+++ b/source/de/anomic/server/serverCharBuffer.java
@@ -50,7 +50,7 @@ import java.io.IOException;
import java.io.Writer;
import java.util.Properties;
-public final class serverCharBuffer /* extends Writer */ {
+public final class serverCharBuffer extends Writer {
public static final char singlequote = '\'';
public static final char doublequote = '"';
@@ -164,10 +164,10 @@ public final class serverCharBuffer /* extends Writer */ {
length += le;
}
- public serverCharBuffer append(char b) {
- write(b);
- return this;
- }
+// public serverCharBuffer append(char b) {
+// write(b);
+// return this;
+// }
public serverCharBuffer append(int i) {
write((char) (i));