diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
index 44b2c28ac..f0c244a7b 100644
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
@@ -29,8 +29,6 @@ package de.anomic.htmlFilter;
import java.util.HashSet;
import java.util.Properties;
-import de.anomic.server.serverCharBuffer;
-
public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
public static final char lb = '<';
@@ -40,255 +38,6 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
private HashSet tags0;
private HashSet tags1;
- // define a translation table for html character codings
- /*
- private static HashMap trans = new HashMap(300);
- static {
- trans.put(""", "\""); //Anführungszeichen oben
- trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und
- trans.put("<", "<"); //öffnende spitze Klammer
- trans.put(">", ">"); //schließende spitze Klammer
- trans.put(" ", " "); //Erzwungenes Leerzeichen
- trans.put("¡", "!"); //umgekehrtes Ausrufezeichen
- trans.put("¢", " cent "); //Cent-Zeichen
- trans.put("£", " pound "); //Pfund-Zeichen
- trans.put("¤", " currency "); //Währungs-Zeichen
- trans.put("¥", " yen "); //Yen-Zeichen
- trans.put("¦", " "); //durchbrochener Strich
- trans.put("§", " paragraph "); //Paragraph-Zeichen
- trans.put("¨", " "); //Pünktchen oben
- trans.put("©", " copyright "); //Copyright-Zeichen
- trans.put("ª", " "); //Ordinal-Zeichen weiblich
- trans.put("«", " "); //angewinkelte Anführungszeichen links
- trans.put("¬", " not "); //Verneinungs-Zeichen
- trans.put("", "-"); //kurzer Trennstrich
- trans.put("®", " trademark "); //Registriermarke-Zeichen
- trans.put("¯", " "); //Überstrich
- trans.put("°", " degree "); //Grad-Zeichen
- trans.put("±", " +/- "); //Plusminus-Zeichen
- trans.put("²", " square "); //Hoch-2-Zeichen
- trans.put("³", " 3 "); //Hoch-3-Zeichen
- trans.put("´", " "); //Acute-Zeichen
- trans.put("µ", " micro "); //Mikro-Zeichen
- trans.put("¶", " paragraph "); //Absatz-Zeichen
- trans.put("·", " "); //Mittelpunkt
- trans.put("¸", " "); //Häkchen unten
- trans.put("¹", " "); //Hoch-1-Zeichen
- trans.put("º", " degree "); //Ordinal-Zeichen männlich
- trans.put("»", " "); //angewinkelte Anführungszeichen rechts
- trans.put("¼", " quarter "); //ein Viertel
- trans.put("½", " half "); //ein Halb
- trans.put("¾", " 3/4 "); //drei Viertel
- trans.put("¿", "?"); //umgekehrtes Fragezeichen
- trans.put("À", "A"); //A mit Accent grave
- trans.put("Á", "A"); //A mit Accent acute
- trans.put("Â", "A"); //A mit Circumflex
- trans.put("Ã", "A"); //A mit Tilde
- trans.put("Ä", "Ae"); //A Umlaut
- trans.put("Å", "A"); //A mit Ring
- trans.put("Æ", "A"); //A mit legiertem E
- trans.put("Ç", "C"); //C mit Häkchen
- trans.put("È", "E"); //E mit Accent grave
- trans.put("É", "E"); //E mit Accent acute
- trans.put("Ê", "E"); //E mit Circumflex
- trans.put("Ë", "E"); //E Umlaut
- trans.put("Ì", "I"); //I mit Accent grave
- trans.put("Í", "I"); //I mit Accent acute
- trans.put("Î", "I"); //I mit Circumflex
- trans.put("Ï", "I"); //I Umlaut
- trans.put("Ð", "D"); //Eth (isländisch)
- trans.put("Ñ", "N"); //N mit Tilde
- trans.put("Ò", "O"); //O mit Accent grave
- trans.put("Ó", "O"); //O mit Accent acute
- trans.put("Ô", "O"); //O mit Circumflex
- trans.put("Õ", "O"); //O mit Tilde
- trans.put("Ö", "Oe"); //O Umlaut
- trans.put("×", " times "); //Mal-Zeichen
- trans.put("Ø", "O"); //O mit Schrägstrich
- trans.put("Ù", "U"); //U mit Accent grave
- trans.put("Ú", "U"); //U mit Accent acute
- trans.put("Û", "U"); //U mit Circumflex
- trans.put("Ü", "Ue"); //U Umlaut
- trans.put("Ý", "Y"); //Y mit Accent acute
- trans.put("Þ", "P"); //THORN (isländisch)
- trans.put("ß", "ss"); //scharfes S
- trans.put("à", "a"); //a mit Accent grave
- trans.put("á", "a"); //a mit Accent acute
- trans.put("â", "a"); //a mit Circumflex
- trans.put("ã", "a"); //a mit Tilde
- trans.put("ä", "ae"); //a Umlaut
- trans.put("å", "a"); //a mit Ring
- trans.put("æ", "a"); //a mit legiertem e
- trans.put("ç", "c"); //c mit Häkchen
- trans.put("è", "e"); //e mit Accent grave
- trans.put("é", "e"); //e mit Accent acute
- trans.put("ê", "e"); //e mit Circumflex
- trans.put("ë", "e"); //e Umlaut
- trans.put("ì", "i"); //i mit Accent grave
- trans.put("í", "i"); //i mit Accent acute
- trans.put("î", "i"); //i mit Circumflex
- trans.put("ï", "i"); //i Umlaut
- trans.put("ð", "d"); //eth (isländisch)
- trans.put("ñ", "n"); //n mit Tilde
- trans.put("ò", "o"); //o mit Accent grave
- trans.put("ó", "o"); //o mit Accent acute
- trans.put("ô", "o"); //o mit Circumflex
- trans.put("õ", "o"); //o mit Tilde
- trans.put("ö", "oe"); //o Umlaut
- trans.put("÷", "%"); //Divisions-Zeichen
- trans.put("ø", "o"); //o mit Schrägstrich
- trans.put("ù", "u"); //u mit Accent grave
- trans.put("ú", "u"); //u mit Accent acute
- trans.put("û", "u"); //u mit Circumflex
- trans.put("ü", "ue"); //u Umlaut
- trans.put("ý", "y"); //y mit Accent acute
- trans.put("þ", "p"); //thorn (isländisch)
- trans.put("ÿ", "y"); //y Umlaut
- trans.put("Α", " Alpha "); //Alpha groß
- trans.put("α", " alpha "); //alpha klein
- trans.put("Β", " Beta "); //Beta groß
- trans.put("β", " beta "); //beta klein
- trans.put("Γ", " Gamma "); //Gamma groß
- trans.put("γ", " gamma "); //gamma klein
- trans.put("Δ", " Delta "); //Delta groß
- trans.put("δ", " delta "); //delta klein
- trans.put("Ε", " Epsilon "); //Epsilon groß
- trans.put("ε", " epsilon "); //epsilon klein
- trans.put("Ζ", " Zeta "); //Zeta groß
- trans.put("ζ", " zeta "); //zeta klein
- trans.put("Η", " Eta "); //Eta groß
- trans.put("η", " eta "); //eta klein
- trans.put("Θ", " Theta "); //Theta groß
- trans.put("θ", " theta "); //theta klein
- trans.put("Ι", " Iota "); //Iota groß
- trans.put("ι", " iota "); //iota klein
- trans.put("Κ", " Kappa "); //Kappa groß
- trans.put("κ", " kappa "); //kappa klein
- trans.put("Λ", " Lambda "); //Lambda groß
- trans.put("λ", " lambda "); //lambda klein
- trans.put("Μ", " Mu "); //Mu groß
- trans.put("μ", " mu "); //mu klein
- trans.put("Ν", " Nu "); //Nu groß
- trans.put("ν", " nu "); //nu klein
- trans.put("Ξ", " Xi "); //Xi groß
- trans.put("ξ", " xi "); //xi klein
- trans.put("Ο", " Omicron "); //Omicron groß
- trans.put("ο", " omicron "); //omicron klein
- trans.put("Π", " Pi "); //Pi groß
- trans.put("π", " pi "); //pi klein
- trans.put("Ρ", " Rho "); //Rho groß
- trans.put("ρ", " rho "); //rho klein
- trans.put("Σ", " Sigma "); //Sigma groß
- trans.put("ς", " sigma "); //sigmaf klein
- trans.put("σ", " sigma "); //sigma klein
- trans.put("Τ", " Tau "); //Tau groß
- trans.put("τ", " tau "); //tau klein
- trans.put("Υ", " Ypsilon "); //Upsilon groß
- trans.put("υ", " ypsilon "); //upsilon klein
- trans.put("Φ", " Phi "); //Phi groß
- trans.put("φ", " phi "); //phi klein
- trans.put("Χ", " Chi "); //Chi groß
- trans.put("χ", " chi "); //chi klein
- trans.put("Ψ", " Psi "); //Psi groß
- trans.put("ψ", " psi "); //psi klein
- trans.put("Ω", " Omega "); //Omega groß
- trans.put("ω", " omega "); //omega klein
- trans.put("ϑ", " theta "); //theta Symbol
- trans.put("ϒ", " ypsilon "); //upsilon mit Haken
- trans.put("ϖ", " pi "); //pi Symbol
- trans.put("∀", " for all "); //für alle
- trans.put("∂", " part of "); //teilweise
- trans.put("∃", " exists "); //existiert
- trans.put("∅", " null "); //leer
- trans.put("∇", " nabla "); //nabla
- trans.put("∈", " element of "); //Element von
- trans.put("∉", " not element of "); //kein Element von
- trans.put("∋", " contains "); //enthält als Element
- trans.put("∏", " product "); //Produkt
- trans.put("∑", " sum "); //Summe
- trans.put("−", " minus "); //minus
- trans.put("∗", " times "); //Asterisk
- trans.put("√", " sqare root "); //Quadratwurzel
- trans.put("∝", " proportional to "); //proportional zu
- trans.put("∞", " unlimited "); //unendlich
- trans.put("∠", " angle "); //Winkel
- trans.put("∧", " and "); //und
- trans.put("∨", " or "); //oder
- trans.put("∩", " "); //Schnittpunkt
- trans.put("∪", " unity "); //Einheit
- trans.put("∫", " integral "); //Integral
- trans.put("∴", " cause "); //deshalb
- trans.put("∼", " similar to "); //ähnlich wie
- trans.put("≅", " equal "); //annähernd gleich
- trans.put("≈", " equal "); //beinahe gleich
- trans.put("≠", " not equal "); //ungleich
- trans.put("≡", " identical "); //identisch mit
- trans.put("≤", " smaller or equal than "); //kleiner gleich
- trans.put("≥", " greater or equal than "); //größer gleich
- trans.put("⊂", " subset of "); //Untermenge von
- trans.put("⊃", " superset of "); //Obermenge von
- trans.put("⊄", " not subset of "); //keine Untermenge von
- trans.put("⊆", ""); //Untermenge von oder gleich mit
- trans.put("⊇", ""); //Obermenge von oder gleich mit
- trans.put("⊕", ""); //Direktsumme
- trans.put("⊗", ""); //Vektorprodukt
- trans.put("⊥", ""); //senkrecht zu
- trans.put("⋅", ""); //Punkt-Operator
- trans.put("◊", ""); //Raute
- trans.put("⌈", ""); //links oben
- trans.put("⌉", ""); //rechts oben
- trans.put("⌊", ""); //links unten
- trans.put("⌋", ""); //rechts unten
- trans.put("〈", ""); //spitze Klammer links
- trans.put("〉", ""); //spitze Klammer rechts
- trans.put("←", ""); //Pfeil links
- trans.put("↑", ""); //Pfeil oben
- trans.put("→", ""); //Pfeil rechts
- trans.put("↓", ""); //Pfeil unten
- trans.put("↔", ""); //Pfeil links/rechts
- trans.put("↵", ""); //Pfeil unten-Knick-links
- trans.put("⇐", ""); //Doppelpfeil links
- trans.put("⇑", ""); //Doppelpfeil oben
- trans.put("⇒", ""); //Doppelpfeil rechts
- trans.put("⇓", ""); //Doppelpfeil unten
- trans.put("⇔", ""); //Doppelpfeil links/rechts
- trans.put("•", ""); //Bullet-Zeichen
- trans.put("…", ""); //Horizontale Ellipse
- trans.put("′", ""); //Minutenzeichen
- trans.put("‾", ""); //Überstrich
- trans.put("⁄", ""); //Bruchstrich
- trans.put("℘", ""); //Weierstrass p
- trans.put("ℑ", ""); //Zeichen für "imaginär"
- trans.put("ℜ", ""); //Zeichen für "real"
- trans.put("™", ""); //Trademark-Zeichen
- trans.put("€", ""); //Euro-Zeichen
- trans.put("ℵ", ""); //Alef-Symbol
- trans.put("♠", ""); //Pik-Zeichen
- trans.put("♣", ""); //Kreuz-Zeichen
- trans.put("♥", ""); //Herz-Zeichen
- trans.put("♦", ""); //Karo-Zeichen
- trans.put(" ", ""); //Leerzeichen Breite n
- trans.put(" ", ""); //Leerzeichen Breite m
- trans.put(" ", ""); //Schmales Leerzeichen
- trans.put("", ""); //null breiter Nichtverbinder
- trans.put("", ""); //null breiter Verbinder
- trans.put("", ""); //links-nach-rechts-Zeichen
- trans.put("", ""); //rechts-nach-links-Zeichen
- trans.put("–", ""); //Gedankenstrich Breite n
- trans.put("—", ""); //Gedankenstrich Breite m
- trans.put("‘", ""); //einfaches Anführungszeichen links
- trans.put("’", ""); //einfaches Anführungszeichen rechts
- trans.put("‚", ""); //einfaches low-9-Zeichen
- trans.put("“", ""); //doppeltes Anführungszeichen links
- trans.put("”", ""); //doppeltes Anführungszeichen rechts
- trans.put("„", ""); //doppeltes low-9-Zeichen rechts
- trans.put("†", ""); //Kreuz
- trans.put("‡", ""); //Doppelkreuz
- trans.put("‰", ""); //zu tausend
- trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links
- trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts
- }
-*/
/**
* create a scraper. the tag sets must contain tags in lowercase!
* @param tags0
@@ -315,111 +64,26 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
- // string conversions
- /*
- private static String code_iso8859s(final char c) {
- switch (c) {
-
- // german umlaute and ligaturen
- case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE";
- case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue";
- case 0xdf: return "ss";
-
- // accent on letters; i.e. french characters
- case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A";
- case 0xc6: return "AE";
- case 0xc7: return "C";
- case 0xc8: case 0xc9: case 0xca: return "E";
- case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I";
- case 0xd0: return "D";
- case 0xd1: return "N";
- case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O";
- case 0xd7: return "x";
- case 0xd9: case 0xda: case 0xdb: return "U";
- case 0xdd: return "Y";
- case 0xde: return "p";
-
- case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a";
- case 0xe6: return "ae";
- case 0xe7: return "c";
- case 0xe8: case 0xe9: case 0xea: return "e";
- case 0xec: case 0xed: case 0xee: case 0xef: return "i";
- case 0xf0: return "d";
- case 0xf1: return "n";
- case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o";
- case 0xf7: return "%";
- case 0xf9: case 0xfa: case 0xfb: return "u";
- case 0xfd: case 0xff: return "y";
- case 0xfe: return "p";
-
- // special characters
- case 0xa4: return " euro ";
- default: return null;
- }
- }
-
- */
- public static serverCharBuffer convertUmlaute(final serverCharBuffer bb) {
- return bb; /*
- if (bb.length() == 0) return bb;
-
- final serverCharBuffer t = new serverCharBuffer(bb.length() + 20);
- char c;
- for (int i = 0; i < bb.length(); i++) {
- c = bb.charAt(i);
- final String z = code_iso8859s(c);
- if (z == null) t.append((int)c);
- else t.append(z);
- }
- return t;
- */
- }
-
- private static char[] transscript(final char[] code) {
- if (code[1] == '#') {
- if (code[2] == 'x' || code[2] == 'X') {
- return new char[] {(char) Integer.parseInt((new String(code)).substring(3, code.length - 1), 16)};
- }
- return new char[] {(char) Integer.parseInt((new String(code)).substring(2, code.length - 1))};
- }
- return new char[0]; /*
- final String t = trans.get(new String(code));
- if (t == null) return new char[0];
- return t.toCharArray();
- */
- }
-
- protected static serverCharBuffer transscriptAll(serverCharBuffer bb) {
- int p0 = 0, p1;
- char[] t;
- while ((p0 = bb.indexOf('&', p0)) >= 0) {
- p1 = bb.indexOf(';', p0);
- if (p1 >= 0) {
- t = transscript(bb.getChars(p0, p1 + 1));
- bb = new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getChars(p1 + 1));
- } else {
- bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).append(bb.getChars(p0 + 1));
- }
- }
- return bb;
- }
-
- protected static serverCharBuffer stripAllTags(serverCharBuffer bb) {
- int p0 = 0, p1;
- while ((p0 = bb.indexOf(lb, p0)) >= 0) {
- p1 = bb.indexOf(rb, p0);
- if (p1 >= 0) {
- bb = (new serverCharBuffer(bb.getChars(0, p0), bb.length() + p0 - p1 + 1).trim().append(32)).append(new serverCharBuffer(bb.getChars(p1 + 1)).trim());
- } else {
- bb = new serverCharBuffer(bb.getChars(0, p0), bb.length()).trim().append(new serverCharBuffer(bb.getChars(p0 + 1)).trim());
+ protected static String stripAllTags(String s) { // copies s without the text enclosed between lb and rb; result is trimmed
+ StringBuffer r = new StringBuffer(s.length());
+ int bc = 0; // bracket counter: number of lb characters seen that are not yet closed by an rb
+ char c;
+ for (int p = 0; p < s.length(); p++) {
+ c = s.charAt(p);
+ if (c == lb) {
+ bc++;
+ r.append(' '); // every opening bracket leaves one blank behind so adjacent words stay separated
+ } else if (c == rb) {
+ if (bc > 0) bc--; // a stray rb must not push bc below zero, otherwise the next tag's content would be copied
+ } else if (bc <= 0) {
+ r.append(c);
}
}
- return bb.trim();
+ return r.toString().trim();
}
- public static serverCharBuffer stripAll(final serverCharBuffer bb) {
- //return stripAllTags(s);
- return convertUmlaute(transscriptAll(stripAllTags(bb)));
+ public static String stripAll(String s) { // runs stripAllTags on s and hands the result to html2unicode
+ return htmlFilterCharacterCoding.html2unicode(stripAllTags(s)); // NOTE(review): html2unicode presumably decodes HTML entities -- confirm
}
public void close() {
diff --git a/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java b/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java
index e9575344f..430df4c4a 100644
--- a/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java
+++ b/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java
@@ -257,6 +257,14 @@ public class htmlFilterCharacterCoding {
sb.append(r);
continue;
}
+ if (s.charAt(1) == '#') {
+ if (s.charAt(2) == 'x' || s.charAt(2) == 'X') {
+ sb.append(new char[] {(char) Integer.parseInt(s.substring(3, s.length() - 1), 16)});
+ continue;
+ }
+ sb.append(new char[] {(char) Integer.parseInt(s.substring(2, s.length() - 1))});
+ continue;
+ }
// the entity is unknown, skip it
}
return new String(sb);
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index e47e7124b..fa0d0d14e 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -117,14 +117,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
public void scrapeText(final char[] newtext, final String insideTag) {
// System.out.println("SCRAPE: " + new String(newtext));
- final serverCharBuffer b = super.stripAll(new serverCharBuffer(newtext, newtext.length + 1)).trim();
+ String b = super.stripAll(new String(newtext)).trim();
if ((insideTag != null) && (!(insideTag.equals("a")))) {
// texts inside tags sometimes have no punctuation at the line end
// this is bad for the text sematics, because it is not possible for the
// condenser to distinguish headlines from text beginnings.
// to make it easier for the condenser, a dot ('.') is appended in case that
// no punctuation is part of the newtext line
- if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b.append((int) '.');
+ if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b = b + '.';
//System.out.println("*** Appended dot: " + b.toString());
}
if (b.length() != 0) content.append(b).append(32);
@@ -222,32 +222,32 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
final String type = (p < 0) ? "" : f.substring(p + 1);
if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg")) {
// special handling of such urls: put them to the image urls
- final htmlFilterImageEntry ie = new htmlFilterImageEntry(url, super.stripAll(new serverCharBuffer(text)).trim().toString(), -1, -1);
+ final htmlFilterImageEntry ie = new htmlFilterImageEntry(url, super.stripAll(new String(text)).trim(), -1, -1);
addImage(images, ie);
} else {
- anchors.put(url, super.stripAll(new serverCharBuffer(text)).trim().toString());
+ anchors.put(url, super.stripAll(new String(text)).trim());
}
}
}
String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
- h = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
+ h = cleanLine(super.stripAll(new String(text)));
if (h.length() > 0) headlines[0].add(h);
}
if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
- h = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
+ h = cleanLine(super.stripAll(new String(text)));
if (h.length() > 0) headlines[1].add(h);
}
if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
- h = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
+ h = cleanLine(super.stripAll(new String(text)));
if (h.length() > 0) headlines[2].add(h);
}
if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
- h = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
+ h = cleanLine(super.stripAll(new String(text)));
if (h.length() > 0) headlines[3].add(h);
}
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
- title = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
+ title = cleanLine(super.stripAll(new String(text)));
}
// fire event
@@ -255,26 +255,16 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
private static String cleanLine(String s) {
- /*
// may contain too many funny symbols
for (int i = 0; i < s.length(); i++)
if (s.charAt(i) < ' ') s = s.substring(0, i) + " " + s.substring(i + 1);
- */
- int p;
-
- // CR/LF entfernen, dabei koennen doppelte Leerzeichen enstehen die aber weiter unten entfernt werden - thq
- while ((p = s.indexOf("\n")) >= 0) s = s.substring(0, p) + ((p + 1 == s.length()) ? "" : " " + s.substring(p + 1));
-
// remove double-spaces
+ int p; // receives the match position returned by indexOf in the loop below
while ((p = s.indexOf(" ")) >= 0) s = s.substring(0, p) + s.substring(p + 1);
- // we don't accept headlines that are too short
- s = s.trim();
- if (s.length() < 4) s = "";
-
// return result
+ return s.trim(); // leading and trailing whitespace is removed as the last step
}
public String getTitle() {
diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
index d1ecd2c95..6fe2ea0f5 100644
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -118,9 +118,7 @@ public class rssParser extends AbstractParser implements Parser {
anchors.put(itemURL, itemTitle);
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
- serverCharBuffer scb = new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray())));
- text.append(scb.trim().toString()).append(' ');
- scb.close();
+ text.append(htmlFilterAbstractScraper.stripAll(itemDescr).trim()).append(' ');
final String itemContent = item.getDescription();
if ((itemContent != null) && (itemContent.length() > 0)) {
diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java
index a5e529f3c..50c40f0b8 100644
--- a/source/de/anomic/plasma/plasmaSearchQuery.java
+++ b/source/de/anomic/plasma/plasmaSearchQuery.java
@@ -33,7 +33,6 @@ import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
-import de.anomic.server.serverCharBuffer;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
@@ -240,7 +239,7 @@ public final class plasmaSearchQuery {
if ((querystring == null) || (querystring.length() == 0)) return new TreeSet[]{new TreeSet(kelondroNaturalOrder.naturalComparator), new TreeSet(kelondroNaturalOrder.naturalComparator)};
// convert Umlaute
- querystring = htmlFilterAbstractScraper.convertUmlaute(new serverCharBuffer(querystring.toCharArray())).toString().toLowerCase().trim();
+ querystring = htmlFilterAbstractScraper.stripAll(querystring).toLowerCase().trim();
int c;
for (int i = 0; i < seps.length(); i++) {
while ((c = querystring.indexOf(seps.charAt(i))) >= 0) { querystring = querystring.substring(0, c) + (((c + 1) < querystring.length()) ? (" " + querystring.substring(c + 1)) : ""); }