|
|
|
@ -29,8 +29,6 @@ package de.anomic.htmlFilter;
|
|
|
|
|
import java.util.HashSet;
|
|
|
|
|
import java.util.Properties;
|
|
|
|
|
|
|
|
|
|
import de.anomic.server.serverCharBuffer;
|
|
|
|
|
|
|
|
|
|
public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
|
|
|
|
|
|
|
|
|
|
public static final char lb = '<';
|
|
|
|
@ -40,255 +38,6 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
|
|
|
|
|
private HashSet<String> tags0;
|
|
|
|
|
private HashSet<String> tags1;
|
|
|
|
|
|
|
|
|
|
// define a translation table for html character codings
|
|
|
|
|
/*
|
|
|
|
|
private static HashMap<String, String> trans = new HashMap<String, String>(300);
|
|
|
|
|
static {
|
|
|
|
|
trans.put(""", "\""); //Anführungszeichen oben
|
|
|
|
|
trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und
|
|
|
|
|
trans.put("<", "<"); //öffnende spitze Klammer
|
|
|
|
|
trans.put(">", ">"); //schließende spitze Klammer
|
|
|
|
|
trans.put(" ", " "); //Erzwungenes Leerzeichen
|
|
|
|
|
trans.put("¡", "!"); //umgekehrtes Ausrufezeichen
|
|
|
|
|
trans.put("¢", " cent "); //Cent-Zeichen
|
|
|
|
|
trans.put("£", " pound "); //Pfund-Zeichen
|
|
|
|
|
trans.put("¤", " currency "); //Währungs-Zeichen
|
|
|
|
|
trans.put("¥", " yen "); //Yen-Zeichen
|
|
|
|
|
trans.put("¦", " "); //durchbrochener Strich
|
|
|
|
|
trans.put("§", " paragraph "); //Paragraph-Zeichen
|
|
|
|
|
trans.put("¨", " "); //Pünktchen oben
|
|
|
|
|
trans.put("©", " copyright "); //Copyright-Zeichen
|
|
|
|
|
trans.put("ª", " "); //Ordinal-Zeichen weiblich
|
|
|
|
|
trans.put("«", " "); //angewinkelte Anführungszeichen links
|
|
|
|
|
trans.put("¬", " not "); //Verneinungs-Zeichen
|
|
|
|
|
trans.put("­", "-"); //kurzer Trennstrich
|
|
|
|
|
trans.put("®", " trademark "); //Registriermarke-Zeichen
|
|
|
|
|
trans.put("¯", " "); //Überstrich
|
|
|
|
|
trans.put("°", " degree "); //Grad-Zeichen
|
|
|
|
|
trans.put("±", " +/- "); //Plusminus-Zeichen
|
|
|
|
|
trans.put("²", " square "); //Hoch-2-Zeichen
|
|
|
|
|
trans.put("³", " 3 "); //Hoch-3-Zeichen
|
|
|
|
|
trans.put("´", " "); //Acute-Zeichen
|
|
|
|
|
trans.put("µ", " micro "); //Mikro-Zeichen
|
|
|
|
|
trans.put("¶", " paragraph "); //Absatz-Zeichen
|
|
|
|
|
trans.put("·", " "); //Mittelpunkt
|
|
|
|
|
trans.put("¸", " "); //Häkchen unten
|
|
|
|
|
trans.put("¹", " "); //Hoch-1-Zeichen
|
|
|
|
|
trans.put("º", " degree "); //Ordinal-Zeichen männlich
|
|
|
|
|
trans.put("»", " "); //angewinkelte Anführungszeichen rechts
|
|
|
|
|
trans.put("¼", " quarter "); //ein Viertel
|
|
|
|
|
trans.put("½", " half "); //ein Halb
|
|
|
|
|
trans.put("¾", " 3/4 "); //drei Viertel
|
|
|
|
|
trans.put("¿", "?"); //umgekehrtes Fragezeichen
|
|
|
|
|
trans.put("À", "A"); //A mit Accent grave
|
|
|
|
|
trans.put("Á", "A"); //A mit Accent acute
|
|
|
|
|
trans.put("Â", "A"); //A mit Circumflex
|
|
|
|
|
trans.put("Ã", "A"); //A mit Tilde
|
|
|
|
|
trans.put("Ä", "Ae"); //A Umlaut
|
|
|
|
|
trans.put("Å", "A"); //A mit Ring
|
|
|
|
|
trans.put("Æ", "A"); //A mit legiertem E
|
|
|
|
|
trans.put("Ç", "C"); //C mit Häkchen
|
|
|
|
|
trans.put("È", "E"); //E mit Accent grave
|
|
|
|
|
trans.put("É", "E"); //E mit Accent acute
|
|
|
|
|
trans.put("Ê", "E"); //E mit Circumflex
|
|
|
|
|
trans.put("Ë", "E"); //E Umlaut
|
|
|
|
|
trans.put("Ì", "I"); //I mit Accent grave
|
|
|
|
|
trans.put("Í", "I"); //I mit Accent acute
|
|
|
|
|
trans.put("Î", "I"); //I mit Circumflex
|
|
|
|
|
trans.put("Ï", "I"); //I Umlaut
|
|
|
|
|
trans.put("Ð", "D"); //Eth (isländisch)
|
|
|
|
|
trans.put("Ñ", "N"); //N mit Tilde
|
|
|
|
|
trans.put("Ò", "O"); //O mit Accent grave
|
|
|
|
|
trans.put("Ó", "O"); //O mit Accent acute
|
|
|
|
|
trans.put("Ô", "O"); //O mit Circumflex
|
|
|
|
|
trans.put("Õ", "O"); //O mit Tilde
|
|
|
|
|
trans.put("Ö", "Oe"); //O Umlaut
|
|
|
|
|
trans.put("×", " times "); //Mal-Zeichen
|
|
|
|
|
trans.put("Ø", "O"); //O mit Schrägstrich
|
|
|
|
|
trans.put("Ù", "U"); //U mit Accent grave
|
|
|
|
|
trans.put("Ú", "U"); //U mit Accent acute
|
|
|
|
|
trans.put("Û", "U"); //U mit Circumflex
|
|
|
|
|
trans.put("Ü", "Ue"); //U Umlaut
|
|
|
|
|
trans.put("Ý", "Y"); //Y mit Accent acute
|
|
|
|
|
trans.put("Þ", "P"); //THORN (isländisch)
|
|
|
|
|
trans.put("ß", "ss"); //scharfes S
|
|
|
|
|
trans.put("à", "a"); //a mit Accent grave
|
|
|
|
|
trans.put("á", "a"); //a mit Accent acute
|
|
|
|
|
trans.put("â", "a"); //a mit Circumflex
|
|
|
|
|
trans.put("ã", "a"); //a mit Tilde
|
|
|
|
|
trans.put("ä", "ae"); //a Umlaut
|
|
|
|
|
trans.put("å", "a"); //a mit Ring
|
|
|
|
|
trans.put("æ", "a"); //a mit legiertem e
|
|
|
|
|
trans.put("ç", "c"); //c mit Häkchen
|
|
|
|
|
trans.put("è", "e"); //e mit Accent grave
|
|
|
|
|
trans.put("é", "e"); //e mit Accent acute
|
|
|
|
|
trans.put("ê", "e"); //e mit Circumflex
|
|
|
|
|
trans.put("ë", "e"); //e Umlaut
|
|
|
|
|
trans.put("ì", "i"); //i mit Accent grave
|
|
|
|
|
trans.put("í", "i"); //i mit Accent acute
|
|
|
|
|
trans.put("î", "i"); //i mit Circumflex
|
|
|
|
|
trans.put("ï", "i"); //i Umlaut
|
|
|
|
|
trans.put("ð", "d"); //eth (isländisch)
|
|
|
|
|
trans.put("ñ", "n"); //n mit Tilde
|
|
|
|
|
trans.put("ò", "o"); //o mit Accent grave
|
|
|
|
|
trans.put("ó", "o"); //o mit Accent acute
|
|
|
|
|
trans.put("ô", "o"); //o mit Circumflex
|
|
|
|
|
trans.put("õ", "o"); //o mit Tilde
|
|
|
|
|
trans.put("ö", "oe"); //o Umlaut
|
|
|
|
|
trans.put("÷", "%"); //Divisions-Zeichen
|
|
|
|
|
trans.put("ø", "o"); //o mit Schrägstrich
|
|
|
|
|
trans.put("ù", "u"); //u mit Accent grave
|
|
|
|
|
trans.put("ú", "u"); //u mit Accent acute
|
|
|
|
|
trans.put("û", "u"); //u mit Circumflex
|
|
|
|
|
trans.put("ü", "ue"); //u Umlaut
|
|
|
|
|
trans.put("ý", "y"); //y mit Accent acute
|
|
|
|
|
trans.put("þ", "p"); //thorn (isländisch)
|
|
|
|
|
trans.put("ÿ", "y"); //y Umlaut
|
|
|
|
|
trans.put("Α", " Alpha "); //Alpha groß
|
|
|
|
|
trans.put("α", " alpha "); //alpha klein
|
|
|
|
|
trans.put("Β", " Beta "); //Beta groß
|
|
|
|
|
trans.put("β", " beta "); //beta klein
|
|
|
|
|
trans.put("Γ", " Gamma "); //Gamma groß
|
|
|
|
|
trans.put("γ", " gamma "); //gamma klein
|
|
|
|
|
trans.put("Δ", " Delta "); //Delta groß
|
|
|
|
|
trans.put("δ", " delta "); //delta klein
|
|
|
|
|
trans.put("Ε", " Epsilon "); //Epsilon groß
|
|
|
|
|
trans.put("ε", " epsilon "); //epsilon klein
|
|
|
|
|
trans.put("Ζ", " Zeta "); //Zeta groß
|
|
|
|
|
trans.put("ζ", " zeta "); //zeta klein
|
|
|
|
|
trans.put("Η", " Eta "); //Eta groß
|
|
|
|
|
trans.put("η", " eta "); //eta klein
|
|
|
|
|
trans.put("Θ", " Theta "); //Theta groß
|
|
|
|
|
trans.put("θ", " theta "); //theta klein
|
|
|
|
|
trans.put("Ι", " Iota "); //Iota groß
|
|
|
|
|
trans.put("ι", " iota "); //iota klein
|
|
|
|
|
trans.put("Κ", " Kappa "); //Kappa groß
|
|
|
|
|
trans.put("κ", " kappa "); //kappa klein
|
|
|
|
|
trans.put("Λ", " Lambda "); //Lambda groß
|
|
|
|
|
trans.put("λ", " lambda "); //lambda klein
|
|
|
|
|
trans.put("Μ", " Mu "); //Mu groß
|
|
|
|
|
trans.put("μ", " mu "); //mu klein
|
|
|
|
|
trans.put("Ν", " Nu "); //Nu groß
|
|
|
|
|
trans.put("ν", " nu "); //nu klein
|
|
|
|
|
trans.put("Ξ", " Xi "); //Xi groß
|
|
|
|
|
trans.put("ξ", " xi "); //xi klein
|
|
|
|
|
trans.put("Ο", " Omicron "); //Omicron groß
|
|
|
|
|
trans.put("ο", " omicron "); //omicron klein
|
|
|
|
|
trans.put("Π", " Pi "); //Pi groß
|
|
|
|
|
trans.put("π", " pi "); //pi klein
|
|
|
|
|
trans.put("Ρ", " Rho "); //Rho groß
|
|
|
|
|
trans.put("ρ", " rho "); //rho klein
|
|
|
|
|
trans.put("Σ", " Sigma "); //Sigma groß
|
|
|
|
|
trans.put("ς", " sigma "); //sigmaf klein
|
|
|
|
|
trans.put("σ", " sigma "); //sigma klein
|
|
|
|
|
trans.put("Τ", " Tau "); //Tau groß
|
|
|
|
|
trans.put("τ", " tau "); //tau klein
|
|
|
|
|
trans.put("Υ", " Ypsilon "); //Upsilon groß
|
|
|
|
|
trans.put("υ", " ypsilon "); //upsilon klein
|
|
|
|
|
trans.put("Φ", " Phi "); //Phi groß
|
|
|
|
|
trans.put("φ", " phi "); //phi klein
|
|
|
|
|
trans.put("Χ", " Chi "); //Chi groß
|
|
|
|
|
trans.put("χ", " chi "); //chi klein
|
|
|
|
|
trans.put("Ψ", " Psi "); //Psi groß
|
|
|
|
|
trans.put("ψ", " psi "); //psi klein
|
|
|
|
|
trans.put("Ω", " Omega "); //Omega groß
|
|
|
|
|
trans.put("ω", " omega "); //omega klein
|
|
|
|
|
trans.put("ϑ", " theta "); //theta Symbol
|
|
|
|
|
trans.put("ϒ", " ypsilon "); //upsilon mit Haken
|
|
|
|
|
trans.put("ϖ", " pi "); //pi Symbol
|
|
|
|
|
trans.put("∀", " for all "); //für alle
|
|
|
|
|
trans.put("∂", " part of "); //teilweise
|
|
|
|
|
trans.put("∃", " exists "); //existiert
|
|
|
|
|
trans.put("∅", " null "); //leer
|
|
|
|
|
trans.put("∇", " nabla "); //nabla
|
|
|
|
|
trans.put("∈", " element of "); //Element von
|
|
|
|
|
trans.put("∉", " not element of "); //kein Element von
|
|
|
|
|
trans.put("∋", " contains "); //enthält als Element
|
|
|
|
|
trans.put("∏", " product "); //Produkt
|
|
|
|
|
trans.put("∑", " sum "); //Summe
|
|
|
|
|
trans.put("−", " minus "); //minus
|
|
|
|
|
trans.put("∗", " times "); //Asterisk
|
|
|
|
|
trans.put("√", " square root "); //Quadratwurzel
|
|
|
|
|
trans.put("∝", " proportional to "); //proportional zu
|
|
|
|
|
trans.put("∞", " unlimited "); //unendlich
|
|
|
|
|
trans.put("∠", " angle "); //Winkel
|
|
|
|
|
trans.put("∧", " and "); //und
|
|
|
|
|
trans.put("∨", " or "); //oder
|
|
|
|
|
trans.put("∩", " "); //Schnittpunkt
|
|
|
|
|
trans.put("∪", " unity "); //Einheit
|
|
|
|
|
trans.put("∫", " integral "); //Integral
|
|
|
|
|
trans.put("∴", " cause "); //deshalb
|
|
|
|
|
trans.put("∼", " similar to "); //ähnlich wie
|
|
|
|
|
trans.put("≅", " equal "); //annähernd gleich
|
|
|
|
|
trans.put("≈", " equal "); //beinahe gleich
|
|
|
|
|
trans.put("≠", " not equal "); //ungleich
|
|
|
|
|
trans.put("≡", " identical "); //identisch mit
|
|
|
|
|
trans.put("≤", " smaller or equal than "); //kleiner gleich
|
|
|
|
|
trans.put("≥", " greater or equal than "); //größer gleich
|
|
|
|
|
trans.put("⊂", " subset of "); //Untermenge von
|
|
|
|
|
trans.put("⊃", " superset of "); //Obermenge von
|
|
|
|
|
trans.put("⊄", " not subset of "); //keine Untermenge von
|
|
|
|
|
trans.put("⊆", ""); //Untermenge von oder gleich mit
|
|
|
|
|
trans.put("⊇", ""); //Obermenge von oder gleich mit
|
|
|
|
|
trans.put("⊕", ""); //Direktsumme
|
|
|
|
|
trans.put("⊗", ""); //Vektorprodukt
|
|
|
|
|
trans.put("⊥", ""); //senkrecht zu
|
|
|
|
|
trans.put("⋅", ""); //Punkt-Operator
|
|
|
|
|
trans.put("◊", ""); //Raute
|
|
|
|
|
trans.put("⌈", ""); //links oben
|
|
|
|
|
trans.put("⌉", ""); //rechts oben
|
|
|
|
|
trans.put("⌊", ""); //links unten
|
|
|
|
|
trans.put("⌋", ""); //rechts unten
|
|
|
|
|
trans.put("⟨", ""); //spitze Klammer links
|
|
|
|
|
trans.put("⟩", ""); //spitze Klammer rechts
|
|
|
|
|
trans.put("←", ""); //Pfeil links
|
|
|
|
|
trans.put("↑", ""); //Pfeil oben
|
|
|
|
|
trans.put("→", ""); //Pfeil rechts
|
|
|
|
|
trans.put("↓", ""); //Pfeil unten
|
|
|
|
|
trans.put("↔", ""); //Pfeil links/rechts
|
|
|
|
|
trans.put("↵", ""); //Pfeil unten-Knick-links
|
|
|
|
|
trans.put("⇐", ""); //Doppelpfeil links
|
|
|
|
|
trans.put("⇑", ""); //Doppelpfeil oben
|
|
|
|
|
trans.put("⇒", ""); //Doppelpfeil rechts
|
|
|
|
|
trans.put("⇓", ""); //Doppelpfeil unten
|
|
|
|
|
trans.put("⇔", ""); //Doppelpfeil links/rechts
|
|
|
|
|
trans.put("•", ""); //Bullet-Zeichen
|
|
|
|
|
trans.put("…", ""); //Horizontale Ellipse
|
|
|
|
|
trans.put("′", ""); //Minutenzeichen
|
|
|
|
|
trans.put("‾", ""); //Überstrich
|
|
|
|
|
trans.put("⁄", ""); //Bruchstrich
|
|
|
|
|
trans.put("℘", ""); //Weierstrass p
|
|
|
|
|
trans.put("ℑ", ""); //Zeichen für "imaginär"
|
|
|
|
|
trans.put("ℜ", ""); //Zeichen für "real"
|
|
|
|
|
trans.put("™", ""); //Trademark-Zeichen
|
|
|
|
|
trans.put("€", ""); //Euro-Zeichen
|
|
|
|
|
trans.put("ℵ", ""); //Alef-Symbol
|
|
|
|
|
trans.put("♠", ""); //Pik-Zeichen
|
|
|
|
|
trans.put("♣", ""); //Kreuz-Zeichen
|
|
|
|
|
trans.put("♥", ""); //Herz-Zeichen
|
|
|
|
|
trans.put("♦", ""); //Karo-Zeichen
|
|
|
|
|
trans.put(" ", ""); //Leerzeichen Breite n
|
|
|
|
|
trans.put(" ", ""); //Leerzeichen Breite m
|
|
|
|
|
trans.put(" ", ""); //Schmales Leerzeichen
|
|
|
|
|
trans.put("‌", ""); //null breiter Nichtverbinder
|
|
|
|
|
trans.put("‍", ""); //null breiter Verbinder
|
|
|
|
|
trans.put("‎", ""); //links-nach-rechts-Zeichen
|
|
|
|
|
trans.put("‏", ""); //rechts-nach-links-Zeichen
|
|
|
|
|
trans.put("–", ""); //Gedankenstrich Breite n
|
|
|
|
|
trans.put("—", ""); //Gedankenstrich Breite m
|
|
|
|
|
trans.put("‘", ""); //einfaches Anführungszeichen links
|
|
|
|
|
trans.put("’", ""); //einfaches Anführungszeichen rechts
|
|
|
|
|
trans.put("‚", ""); //einfaches low-9-Zeichen
|
|
|
|
|
trans.put("“", ""); //doppeltes Anführungszeichen links
|
|
|
|
|
trans.put("”", ""); //doppeltes Anführungszeichen rechts
|
|
|
|
|
trans.put("„", ""); //doppeltes low-9-Zeichen rechts
|
|
|
|
|
trans.put("†", ""); //Kreuz
|
|
|
|
|
trans.put("‡", ""); //Doppelkreuz
|
|
|
|
|
trans.put("‰", ""); //zu tausend
|
|
|
|
|
trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links
|
|
|
|
|
trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts
|
|
|
|
|
}
|
|
|
|
|
*/
|
|
|
|
|
/**
|
|
|
|
|
* Creates a scraper. The tag sets must contain tag names in lowercase!
|
|
|
|
|
* @param tags0
|
|
|
|
@ -315,111 +64,26 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
|
|
|
|
|
|
|
|
|
|
/**
 * Abstract hook that concrete scrapers implement; it receives a tag name,
 * the tag's options and a character array.
 * NOTE(review): presumably called for tags listed in the tags1 set, with
 * the text enclosed by the tag - confirm against the code that drives the scraper.
 *
 * @param tagname name of the tag (presumably lowercase, like the tag sets; TODO confirm)
 * @param tagopts options of the tag (assumed to be its attributes; TODO confirm)
 * @param text    characters handed over together with the tag
 */
public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
|
|
|
|
|
|
|
|
|
|
// string conversions
|
|
|
|
|
/*
|
|
|
|
|
private static String code_iso8859s(final char c) {
|
|
|
|
|
switch (c) {
|
|
|
|
|
|
|
|
|
|
// german umlaute and ligaturen
|
|
|
|
|
case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE";
|
|
|
|
|
case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue";
|
|
|
|
|
case 0xdf: return "ss";
|
|
|
|
|
|
|
|
|
|
// accent on letters; i.e. french characters
|
|
|
|
|
case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A";
|
|
|
|
|
case 0xc6: return "AE";
|
|
|
|
|
case 0xc7: return "C";
|
|
|
|
|
case 0xc8: case 0xc9: case 0xca: return "E";
|
|
|
|
|
case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I";
|
|
|
|
|
case 0xd0: return "D";
|
|
|
|
|
case 0xd1: return "N";
|
|
|
|
|
case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O";
|
|
|
|
|
case 0xd7: return "x";
|
|
|
|
|
case 0xd9: case 0xda: case 0xdb: return "U";
|
|
|
|
|
case 0xdd: return "Y";
|
|
|
|
|
case 0xde: return "p";
|
|
|
|
|
|
|
|
|
|
case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a";
|
|
|
|
|
case 0xe6: return "ae";
|
|
|
|
|
case 0xe7: return "c";
|
|
|
|
|
case 0xe8: case 0xe9: case 0xea: return "e";
|
|
|
|
|
case 0xec: case 0xed: case 0xee: case 0xef: return "i";
|
|
|
|
|
case 0xf0: return "d";
|
|
|
|
|
case 0xf1: return "n";
|
|
|
|
|
case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o";
|
|
|
|
|
case 0xf7: return "%";
|
|
|
|
|
case 0xf9: case 0xfa: case 0xfb: return "u";
|
|
|
|
|
case 0xfd: case 0xff: return "y";
|
|
|
|
|
case 0xfe: return "p";
|
|
|
|
|
|
|
|
|
|
// special characters
|
|
|
|
|
case 0xa4: return " euro ";
|
|
|
|
|
default: return null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
public static serverCharBuffer convertUmlaute(final serverCharBuffer bb) {
|
|
|
|
|
return bb; /*
|
|
|
|
|
if (bb.length() == 0) return bb;
|
|
|
|
|
|
|
|
|
|
final serverCharBuffer t = new serverCharBuffer(bb.length() + 20);
|
|
|
|
|
char c;
|
|
|
|
|
for (int i = 0; i < bb.length(); i++) {
|
|
|
|
|
c = bb.charAt(i);
|
|
|
|
|
final String z = code_iso8859s(c);
|
|
|
|
|
if (z == null) t.append((int)c);
|
|
|
|
|
else t.append(z);
|
|
|
|
|
}
|
|
|
|
|
return t;
|
|
|
|
|
*/
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static char[] transscript(final char[] code) {
|
|
|
|
|
if (code[1] == '#') {
|
|
|
|
|
if (code[2] == 'x' || code[2] == 'X') {
|
|
|
|
|
return new char[] {(char) Integer.parseInt((new String(code)).substring(3, code.length - 1), 16)};
|
|
|
|
|
}
|
|
|
|
|
return new char[] {(char) Integer.parseInt((new String(code)).substring(2, code.length - 1))};
|
|
|
|
|
}
|
|
|
|
|
return new char[0]; /*
|
|
|
|
|
final String t = trans.get(new String(code));
|
|
|
|
|
if (t == null) return new char[0];
|
|
|
|
|
return t.toCharArray();
|
|
|
|
|
*/
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
protected static serverCharBuffer transscriptAll(serverCharBuffer bb) {
|
|
|
|
|
int p0 = 0, p1;
|
|
|
|
|
char[] t;
|
|
|
|
|
while ((p0 = bb.indexOf('&', p0)) >= 0) {
|
|
|
|
|
p1 = bb.indexOf(';', p0);
|
|
|
|
|
if (p1 >= 0) {
|
|
|
|
|
t = transscript(bb.getChars(p0, p1 + 1));
|
|
|
|
|
bb = new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getChars(p1 + 1));
|
|
|
|
|
} else {
|
|
|
|
|
bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).append(bb.getChars(p0 + 1));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return bb;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
protected static serverCharBuffer stripAllTags(serverCharBuffer bb) {
|
|
|
|
|
int p0 = 0, p1;
|
|
|
|
|
while ((p0 = bb.indexOf(lb, p0)) >= 0) {
|
|
|
|
|
p1 = bb.indexOf(rb, p0);
|
|
|
|
|
if (p1 >= 0) {
|
|
|
|
|
bb = (new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + 1).trim().append(32)).append(new serverCharBuffer(bb.getChars(p1 + 1)).trim());
|
|
|
|
|
} else {
|
|
|
|
|
bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).trim().append(new serverCharBuffer(bb.getChars(p0 + 1)).trim());
|
|
|
|
|
protected static String stripAllTags(String s) {
|
|
|
|
|
StringBuffer r = new StringBuffer(s.length());
|
|
|
|
|
int bc = 0;
|
|
|
|
|
char c;
|
|
|
|
|
for (int p = 0; p < s.length(); p++) {
|
|
|
|
|
c = s.charAt(p);
|
|
|
|
|
if (c == lb) {
|
|
|
|
|
bc++;
|
|
|
|
|
r.append(' ');
|
|
|
|
|
} else if (c == rb) {
|
|
|
|
|
bc --;
|
|
|
|
|
} else if (bc <= 0) {
|
|
|
|
|
r.append(c);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return bb.trim();
|
|
|
|
|
return r.toString().trim();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static serverCharBuffer stripAll(final serverCharBuffer bb) {
|
|
|
|
|
//return stripAllTags(s);
|
|
|
|
|
return convertUmlaute(transscriptAll(stripAllTags(bb)));
|
|
|
|
|
public static String stripAll(String s) {
|
|
|
|
|
return htmlFilterCharacterCoding.html2unicode(stripAllTags(s));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void close() {
|
|
|
|
|