diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index 44b2c28ac..f0c244a7b 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -29,8 +29,6 @@ package de.anomic.htmlFilter; import java.util.HashSet; import java.util.Properties; -import de.anomic.server.serverCharBuffer; - public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public static final char lb = '<'; @@ -40,255 +38,6 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { private HashSet tags0; private HashSet tags1; - // define a translation table for html character codings - /* - private static HashMap trans = new HashMap(300); - static { - trans.put(""", "\""); //Anführungszeichen oben - trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und - trans.put("<", "<"); //öffnende spitze Klammer - trans.put(">", ">"); //schließende spitze Klammer - trans.put(" ", " "); //Erzwungenes Leerzeichen - trans.put("¡", "!"); //umgekehrtes Ausrufezeichen - trans.put("¢", " cent "); //Cent-Zeichen - trans.put("£", " pound "); //Pfund-Zeichen - trans.put("¤", " currency "); //Währungs-Zeichen - trans.put("¥", " yen "); //Yen-Zeichen - trans.put("¦", " "); //durchbrochener Strich - trans.put("§", " paragraph "); //Paragraph-Zeichen - trans.put("¨", " "); //Pünktchen oben - trans.put("©", " copyright "); //Copyright-Zeichen - trans.put("ª", " "); //Ordinal-Zeichen weiblich - trans.put("«", " "); //angewinkelte Anführungszeichen links - trans.put("¬", " not "); //Verneinungs-Zeichen - trans.put("­", "-"); //kurzer Trennstrich - trans.put("®", " trademark "); //Registriermarke-Zeichen - trans.put("¯", " "); //Überstrich - trans.put("°", " degree "); //Grad-Zeichen - trans.put("±", " +/- "); //Plusminus-Zeichen - trans.put("²", " square "); //Hoch-2-Zeichen - trans.put("³", " 3 "); //Hoch-3-Zeichen - trans.put("´", " "); //Acute-Zeichen - trans.put("µ", " micro "); //Mikro-Zeichen - trans.put("¶", " paragraph "); //Absatz-Zeichen - trans.put("·", " "); //Mittelpunkt - trans.put("¸", " "); //Häkchen unten - trans.put("¹", " "); //Hoch-1-Zeichen - trans.put("º", " degree "); //Ordinal-Zeichen männlich - trans.put("»", " "); //angewinkelte Anführungszeichen rechts - trans.put("¼", " quarter "); //ein Viertel - trans.put("½", " half "); //ein Halb - trans.put("¾", " 3/4 "); //drei Viertel - trans.put("¿", "?"); //umgekehrtes Fragezeichen - trans.put("À", "A"); //A mit Accent grave - trans.put("Á", "A"); //A mit Accent acute - trans.put("Â", "A"); //A mit Circumflex - trans.put("Ã", "A"); //A mit Tilde - trans.put("Ä", "Ae"); //A Umlaut - trans.put("Å", "A"); //A mit Ring - trans.put("Æ", "A"); //A mit legiertem E - trans.put("Ç", "C"); //C mit Häkchen - trans.put("È", "E"); //E mit Accent grave - trans.put("É", "E"); //E mit Accent acute - trans.put("Ê", "E"); //E mit Circumflex - trans.put("Ë", "E"); //E Umlaut - trans.put("Ì", "I"); //I mit Accent grave - trans.put("Í", "I"); //I mit Accent acute - trans.put("Î", "I"); //I mit Circumflex - trans.put("Ï", "I"); //I Umlaut - trans.put("Ð", "D"); //Eth (isländisch) - trans.put("Ñ", "N"); //N mit Tilde - trans.put("Ò", "O"); //O mit Accent grave - trans.put("Ó", "O"); //O mit Accent acute - trans.put("Ô", "O"); //O mit Circumflex - trans.put("Õ", "O"); //O mit Tilde - trans.put("Ö", "Oe"); //O Umlaut - trans.put("×", " times "); //Mal-Zeichen - trans.put("Ø", "O"); //O mit Schrägstrich - trans.put("Ù", "U"); //U mit Accent grave - trans.put("Ú", "U"); //U mit Accent acute - trans.put("Û", "U"); //U mit Circumflex - trans.put("Ü", "Ue"); //U Umlaut - trans.put("Ý", "Y"); //Y mit Accent acute - trans.put("Þ", "P"); //THORN (isländisch) - trans.put("ß", "ss"); //scharfes S - trans.put("à", "a"); //a mit Accent grave - trans.put("á", "a"); //a mit Accent acute - trans.put("â", "a"); //a mit Circumflex - trans.put("ã", "a"); //a mit Tilde - trans.put("ä", "ae"); //a Umlaut - trans.put("å", "a"); //a mit Ring - trans.put("æ", "a"); //a mit legiertem e - trans.put("ç", "c"); //c mit Häkchen - trans.put("è", "e"); //e mit Accent grave - trans.put("é", "e"); //e mit Accent acute - trans.put("ê", "e"); //e mit Circumflex - trans.put("ë", "e"); //e Umlaut - trans.put("ì", "i"); //i mit Accent grave - trans.put("í", "i"); //i mit Accent acute - trans.put("î", "i"); //i mit Circumflex - trans.put("ï", "i"); //i Umlaut - trans.put("ð", "d"); //eth (isländisch) - trans.put("ñ", "n"); //n mit Tilde - trans.put("ò", "o"); //o mit Accent grave - trans.put("ó", "o"); //o mit Accent acute - trans.put("ô", "o"); //o mit Circumflex - trans.put("õ", "o"); //o mit Tilde - trans.put("ö", "oe"); //o Umlaut - trans.put("÷", "%"); //Divisions-Zeichen - trans.put("ø", "o"); //o mit Schrägstrich - trans.put("ù", "u"); //u mit Accent grave - trans.put("ú", "u"); //u mit Accent acute - trans.put("û", "u"); //u mit Circumflex - trans.put("ü", "ue"); //u Umlaut - trans.put("ý", "y"); //y mit Accent acute - trans.put("þ", "p"); //thorn (isländisch) - trans.put("ÿ", "y"); //y Umlaut - trans.put("Α", " Alpha "); //Alpha groß - trans.put("α", " alpha "); //alpha klein - trans.put("Β", " Beta "); //Beta groß - trans.put("β", " beta "); //beta klein - trans.put("Γ", " Gamma "); //Gamma groß - trans.put("γ", " gamma "); //gamma klein - trans.put("Δ", " Delta "); //Delta groß - trans.put("δ", " delta "); //delta klein - trans.put("Ε", " Epsilon "); //Epsilon groß - trans.put("ε", " epsilon "); //epsilon klein - trans.put("Ζ", " Zeta "); //Zeta groß - trans.put("ζ", " zeta "); //zeta klein - trans.put("Η", " Eta "); //Eta groß - trans.put("η", " eta "); //eta klein - trans.put("Θ", " Theta "); //Theta groß - trans.put("θ", " theta "); //theta klein - trans.put("Ι", " Iota "); //Iota groß - trans.put("ι", " iota "); //iota klein - trans.put("Κ", " Kappa "); //Kappa groß - trans.put("κ", " kappa "); //kappa klein - trans.put("Λ", " Lambda "); //Lambda groß - trans.put("λ", " lambda "); //lambda klein - trans.put("Μ", " Mu "); //Mu groß - trans.put("μ", " mu "); //mu klein - trans.put("Ν", " Nu "); //Nu groß - trans.put("ν", " nu "); //nu klein - trans.put("Ξ", " Xi "); //Xi groß - trans.put("ξ", " xi "); //xi klein - trans.put("Ο", " Omicron "); //Omicron groß - trans.put("ο", " omicron "); //omicron klein - trans.put("Π", " Pi "); //Pi groß - trans.put("π", " pi "); //pi klein - trans.put("Ρ", " Rho "); //Rho groß - trans.put("ρ", " rho "); //rho klein - trans.put("Σ", " Sigma "); //Sigma groß - trans.put("ς", " sigma "); //sigmaf klein - trans.put("σ", " sigma "); //sigma klein - trans.put("Τ", " Tau "); //Tau groß - trans.put("τ", " tau "); //tau klein - trans.put("Υ", " Ypsilon "); //Upsilon groß - trans.put("υ", " ypsilon "); //upsilon klein - trans.put("Φ", " Phi "); //Phi groß - trans.put("φ", " phi "); //phi klein - trans.put("Χ", " Chi "); //Chi groß - trans.put("χ", " chi "); //chi klein - trans.put("Ψ", " Psi "); //Psi groß - trans.put("ψ", " psi "); //psi klein - trans.put("Ω", " Omega "); //Omega groß - trans.put("ω", " omega "); //omega klein - trans.put("ϑ", " theta "); //theta Symbol - trans.put("ϒ", " ypsilon "); //upsilon mit Haken - trans.put("ϖ", " pi "); //pi Symbol - trans.put("∀", " for all "); //für alle - trans.put("∂", " part of "); //teilweise - trans.put("∃", " exists "); //existiert - trans.put("∅", " null "); //leer - trans.put("∇", " nabla "); //nabla - trans.put("∈", " element of "); //Element von - trans.put("∉", " not element of "); //kein Element von - trans.put("∋", " contains "); //enthält als Element - trans.put("∏", " product "); //Produkt - trans.put("∑", " sum "); //Summe - trans.put("−", " minus "); //minus - trans.put("∗", " times "); //Asterisk - trans.put("√", " sqare root "); //Quadratwurzel - trans.put("∝", " proportional to "); //proportional zu - trans.put("∞", " unlimited "); //unendlich - trans.put("∠", " angle "); //Winkel - trans.put("∧", " and "); //und - trans.put("∨", " or "); //oder - trans.put("∩", " "); //Schnittpunkt - trans.put("∪", " unity "); //Einheit - trans.put("∫", " integral "); //Integral - trans.put("∴", " cause "); //deshalb - trans.put("∼", " similar to "); //ähnlich wie - trans.put("≅", " equal "); //annähernd gleich - trans.put("≈", " equal "); //beinahe gleich - trans.put("≠", " not equal "); //ungleich - trans.put("≡", " identical "); //identisch mit - trans.put("≤", " smaller or equal than "); //kleiner gleich - trans.put("≥", " greater or equal than "); //größer gleich - trans.put("⊂", " subset of "); //Untermenge von - trans.put("⊃", " superset of "); //Obermenge von - trans.put("⊄", " not subset of "); //keine Untermenge von - trans.put("⊆", ""); //Untermenge von oder gleich mit - trans.put("⊇", ""); //Obermenge von oder gleich mit - trans.put("⊕", ""); //Direktsumme - trans.put("⊗", ""); //Vektorprodukt - trans.put("⊥", ""); //senkrecht zu - trans.put("⋅", ""); //Punkt-Operator - trans.put("◊", ""); //Raute - trans.put("⌈", ""); //links oben - trans.put("⌉", ""); //rechts oben - trans.put("⌊", ""); //links unten - trans.put("⌋", ""); //rechts unten - trans.put("⟨", ""); //spitze Klammer links - trans.put("⟩", ""); //spitze Klammer rechts - trans.put("←", ""); //Pfeil links - trans.put("↑", ""); //Pfeil oben - trans.put("→", ""); //Pfeil rechts - trans.put("↓", ""); //Pfeil unten - trans.put("↔", ""); //Pfeil links/rechts - trans.put("↵", ""); //Pfeil unten-Knick-links - trans.put("⇐", ""); //Doppelpfeil links - trans.put("⇑", ""); //Doppelpfeil oben - trans.put("⇒", ""); //Doppelpfeil rechts - trans.put("⇓", ""); //Doppelpfeil unten - trans.put("⇔", ""); //Doppelpfeil links/rechts - trans.put("•", ""); //Bullet-Zeichen - trans.put("…", ""); //Horizontale Ellipse - trans.put("′", ""); //Minutenzeichen - trans.put("‾", ""); //Überstrich - trans.put("⁄", ""); //Bruchstrich - trans.put("℘", ""); //Weierstrass p - trans.put("ℑ", ""); //Zeichen für "imaginär" - trans.put("ℜ", ""); //Zeichen für "real" - trans.put("™", ""); //Trademark-Zeichen - trans.put("€", ""); //Euro-Zeichen - trans.put("ℵ", ""); //Alef-Symbol - trans.put("♠", ""); //Pik-Zeichen - trans.put("♣", ""); //Kreuz-Zeichen - trans.put("♥", ""); //Herz-Zeichen - trans.put("♦", ""); //Karo-Zeichen - trans.put(" ", ""); //Leerzeichen Breite n - trans.put(" ", ""); //Leerzeichen Breite m - trans.put(" ", ""); //Schmales Leerzeichen - trans.put("‌", ""); //null breiter Nichtverbinder - trans.put("‍", ""); //null breiter Verbinder - trans.put("‎", ""); //links-nach-rechts-Zeichen - trans.put("‏", ""); //rechts-nach-links-Zeichen - trans.put("–", ""); //Gedankenstrich Breite n - trans.put("—", ""); //Gedankenstrich Breite m - trans.put("‘", ""); //einfaches Anführungszeichen links - trans.put("’", ""); //einfaches Anführungszeichen rechts - trans.put("‚", ""); //einfaches low-9-Zeichen - trans.put("“", ""); //doppeltes Anführungszeichen links - trans.put("”", ""); //doppeltes Anführungszeichen rechts - trans.put("„", ""); //doppeltes low-9-Zeichen rechts - trans.put("†", ""); //Kreuz - trans.put("‡", ""); //Doppelkreuz - trans.put("‰", ""); //zu tausend - trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links - trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts - } -*/ /** * create a scraper. the tag sets must contain tags in lowercase! * @param tags0 @@ -315,111 +64,26 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text); - // string conversions - /* - private static String code_iso8859s(final char c) { - switch (c) { - - // german umlaute and ligaturen - case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE"; - case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue"; - case 0xdf: return "ss"; - - // accent on letters; i.e. french characters - case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A"; - case 0xc6: return "AE"; - case 0xc7: return "C"; - case 0xc8: case 0xc9: case 0xca: return "E"; - case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I"; - case 0xd0: return "D"; - case 0xd1: return "N"; - case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O"; - case 0xd7: return "x"; - case 0xd9: case 0xda: case 0xdb: return "U"; - case 0xdd: return "Y"; - case 0xde: return "p"; - - case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a"; - case 0xe6: return "ae"; - case 0xe7: return "c"; - case 0xe8: case 0xe9: case 0xea: return "e"; - case 0xec: case 0xed: case 0xee: case 0xef: return "i"; - case 0xf0: return "d"; - case 0xf1: return "n"; - case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o"; - case 0xf7: return "%"; - case 0xf9: case 0xfa: case 0xfb: return "u"; - case 0xfd: case 0xff: return "y"; - case 0xfe: return "p"; - - // special characters - case 0xa4: return " euro "; - default: return null; - } - } - - */ - public static serverCharBuffer convertUmlaute(final serverCharBuffer bb) { - return bb; /* - if (bb.length() == 0) return bb; - - final serverCharBuffer t = new serverCharBuffer(bb.length() + 20); - char c; - for (int i = 0; i < bb.length(); i++) { - c = bb.charAt(i); - final String z = code_iso8859s(c); - if (z == null) t.append((int)c); - else t.append(z); - } - return t; - */ - } - - private static char[] transscript(final char[] code) { - if (code[1] == '#') { - if (code[2] == 'x' || code[2] == 'X') { - return new char[] {(char) Integer.parseInt((new String(code)).substring(3, code.length - 1), 16)}; - } - return new char[] {(char) Integer.parseInt((new String(code)).substring(2, code.length - 1))}; - } - return new char[0]; /* - final String t = trans.get(new String(code)); - if (t == null) return new char[0]; - return t.toCharArray(); - */ - } - - protected static serverCharBuffer transscriptAll(serverCharBuffer bb) { - int p0 = 0, p1; - char[] t; - while ((p0 = bb.indexOf('&', p0)) >= 0) { - p1 = bb.indexOf(';', p0); - if (p1 >= 0) { - t = transscript(bb.getChars(p0, p1 + 1)); - bb = new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getChars(p1 + 1)); - } else { - bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).append(bb.getChars(p0 + 1)); - } - } - return bb; - } - - protected static serverCharBuffer stripAllTags(serverCharBuffer bb) { - int p0 = 0, p1; - while ((p0 = bb.indexOf(lb, p0)) >= 0) { - p1 = bb.indexOf(rb, p0); - if (p1 >= 0) { - bb = (new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + 1).trim().append(32)).append(new serverCharBuffer(bb.getChars(p1 + 1)).trim()); - } else { - bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).trim().append(new serverCharBuffer(bb.getChars(p0 + 1)).trim()); + protected static String stripAllTags(String s) { + StringBuffer r = new StringBuffer(s.length()); + int bc = 0; + char c; + for (int p = 0; p < s.length(); p++) { + c = s.charAt(p); + if (c == lb) { + bc++; + r.append(' '); + } else if (c == rb) { + bc --; + } else if (bc <= 0) { + r.append(c); } } - return bb.trim(); + return r.toString().trim(); } - public static serverCharBuffer stripAll(final serverCharBuffer bb) { - //return stripAllTags(s); - return convertUmlaute(transscriptAll(stripAllTags(bb))); + public static String stripAll(String s) { + return htmlFilterCharacterCoding.html2unicode(stripAllTags(s)); } public void close() { diff --git a/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java b/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java index e9575344f..430df4c4a 100644 --- a/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java +++ b/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java @@ -257,6 +257,14 @@ public class htmlFilterCharacterCoding { sb.append(r); continue; } + if (s.charAt(1) == '#') { + if (s.charAt(2) == 'x' || s.charAt(2) == 'X') { + sb.append(new char[] {(char) Integer.parseInt(s.substring(3, s.length() - 1), 16)}); + continue; + } + sb.append(new char[] {(char) Integer.parseInt(s.substring(2, s.length() - 1))}); + continue; + } // the entity is unknown, skip it } return new String(sb); diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index e47e7124b..fa0d0d14e 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -117,14 +117,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen public void scrapeText(final char[] newtext, final String insideTag) { // System.out.println("SCRAPE: " + new String(newtext)); - final serverCharBuffer b = super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim(); + String b = super.stripAll(new String(newtext)).trim(); if ((insideTag != null) && (!(insideTag.equals("a")))) { // texts inside tags sometimes have no punctuation at the line end // this is bad for the text sematics, because it is not possible for the // condenser to distinguish headlines from text beginnings. // to make it easier for the condenser, a dot ('.') is appended in case that // no punctuation is part of the newtext line - if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b.append((int) '.'); + if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b = b + '.'; //System.out.println("*** Appended dot: " + b.toString()); } if (b.length() != 0) content.append(b).append(32); @@ -222,32 +222,32 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen final String type = (p < 0) ? "" : f.substring(p + 1); if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg")) { // special handling of such urls: put them to the image urls - final htmlFilterImageEntry ie = new htmlFilterImageEntry(url, super.stripAll(new serverCharBuffer(text)).trim().toString(), -1, -1); + final htmlFilterImageEntry ie = new htmlFilterImageEntry(url, super.stripAll(new String(text)).trim(), -1, -1); addImage(images, ie); } else { - anchors.put(url, super.stripAll(new serverCharBuffer(text)).trim().toString()); + anchors.put(url, super.stripAll(new String(text)).trim()); } } } String h; if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) { - h = cleanLine(super.stripAll(new serverCharBuffer(text)).toString()); + h = cleanLine(super.stripAll(new String(text))); if (h.length() > 0) headlines[0].add(h); } if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) { - h = cleanLine(super.stripAll(new serverCharBuffer(text)).toString()); + h = cleanLine(super.stripAll(new String(text))); if (h.length() > 0) headlines[1].add(h); } if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) { - h = cleanLine(super.stripAll(new serverCharBuffer(text)).toString()); + h = cleanLine(super.stripAll(new String(text))); if (h.length() > 0) headlines[2].add(h); } if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) { - h = cleanLine(super.stripAll(new serverCharBuffer(text)).toString()); + h = cleanLine(super.stripAll(new String(text))); if (h.length() > 0) headlines[3].add(h); } if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) { - title = cleanLine(super.stripAll(new serverCharBuffer(text)).toString()); + title = cleanLine(super.stripAll(new String(text))); } // fire event @@ -255,26 +255,16 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } private static String cleanLine(String s) { - /* // may contain too many funny symbols for (int i = 0; i < s.length(); i++) if (s.charAt(i) < ' ') s = s.substring(0, i) + " " + s.substring(i + 1); - */ - int p; - - // CR/LF entfernen, dabei koennen doppelte Leerzeichen enstehen die aber weiter unten entfernt werden - thq - while ((p = s.indexOf("\n")) >= 0) s = s.substring(0, p) + ((p + 1 == s.length()) ? "" : " " + s.substring(p + 1)); - // remove double-spaces + int p; while ((p = s.indexOf(" ")) >= 0) s = s.substring(0, p) + s.substring(p + 1); - // we don't accept headlines that are too short - s = s.trim(); - if (s.length() < 4) s = ""; - // return result - return s; + return s.trim(); } public String getTitle() { diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index d1ecd2c95..6fe2ea0f5 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -118,9 +118,7 @@ public class rssParser extends AbstractParser implements Parser { anchors.put(itemURL, itemTitle); if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32); - serverCharBuffer scb = new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray()))); - text.append(scb.trim().toString()).append(' '); - scb.close(); + text.append(htmlFilterAbstractScraper.stripAll(itemDescr).trim()).append(' '); final String itemContent = item.getDescription(); if ((itemContent != null) && (itemContent.length() > 0)) { diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index a5e529f3c..50c40f0b8 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -33,7 +33,6 @@ import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroNaturalOrder; -import de.anomic.server.serverCharBuffer; import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacyURL; @@ -240,7 +239,7 @@ public final class plasmaSearchQuery { if ((querystring == null) || (querystring.length() == 0)) return new TreeSet[]{new TreeSet(kelondroNaturalOrder.naturalComparator), new TreeSet(kelondroNaturalOrder.naturalComparator)}; // convert Umlaute - querystring = htmlFilterAbstractScraper.convertUmlaute(new serverCharBuffer(querystring.toCharArray())).toString().toLowerCase().trim(); + querystring = htmlFilterAbstractScraper.stripAll(querystring).toLowerCase().trim(); int c; for (int i = 0; i < seps.length(); i++) { while ((c = querystring.indexOf(seps.charAt(i))) >= 0) { querystring = querystring.substring(0, c) + (((c + 1) < querystring.length()) ? (" " + querystring.substring(c + 1)) : ""); }