diff --git a/htroot/Settings_p.html b/htroot/Settings_p.html
index 97f4c0c87..7462df4db 100644
--- a/htroot/Settings_p.html
+++ b/htroot/Settings_p.html
@@ -139,8 +139,7 @@ Alternatively, you can simply set a virtual server port on your NAT/Server to en
This is the account that restricts access to the proxy function. You probably don't want to share the proxy to the internet, so you should set the IP-Number Access Domain to a pattern that corresponds to you local intranet. The default setting should be right in most cases. If you want, you can also set a proxy account so that every proxy user must authenticate first, but this is rather unusual.
-IP-Number filter:
+IP-Number filter:
Account Name:
Password:
diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
index 591ca5881..bbf87c0e7 100644
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
@@ -1 +1 @@
-// htmlFilterAbstractScraper.java
// ---------------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 18.02.2004
//
// You agree that the Author(s) is (are) not responsible for cost,
// loss of data or any harm that may be caused by usage of this softare or
// this documentation. The usage of this software is on your own risk. The
// installation and usage (starting/running) of this software may allow other
// people or application to access your computer and any attached devices and
// is highly dependent on the configuration of the software which must be
// done by the user of the software;the author(s) is (are) also
// not responsible for proper configuration and usage of the software, even
// if provoked by documentation provided together with the software.
//
// THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION
// IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS
// FILE AND AS IN http://www.gnu.org/licenses/gpl.txt
// ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE
// LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT
// BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION
// MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE.
// CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH.
package de.anomic.htmlFilter;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Properties;
import de.anomic.server.serverByteBuffer;
public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
public static final byte lb = (byte) '<';
public static final byte rb = (byte) '>';
public static final byte sl = (byte) '/';
private HashSet tags0;
private HashSet tags1;
// define a translation table for html character codings
private static HashMap trans = new HashMap(300);
static {
trans.put(""", "\""); //Anführungszeichen oben
trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und
trans.put("<", "<"); //öffnende spitze Klammer
trans.put(">", ">"); //schließende spitze Klammer
trans.put(" ", " "); //Erzwungenes Leerzeichen
trans.put("¡", "!"); //umgekehrtes Ausrufezeichen
trans.put("¢", " cent "); //Cent-Zeichen
trans.put("£", " pound "); //Pfund-Zeichen
trans.put("¤", " currency "); //Währungs-Zeichen
trans.put("¥", " yen "); //Yen-Zeichen
trans.put("¦", " "); //durchbrochener Strich
trans.put("§", " paragraph "); //Paragraph-Zeichen
trans.put("¨", " "); //Pünktchen oben
trans.put("©", " copyright "); //Copyright-Zeichen
trans.put("ª", " "); //Ordinal-Zeichen weiblich
trans.put("«", " "); //angewinkelte Anführungszeichen links
trans.put("¬", " not "); //Verneinungs-Zeichen
trans.put("", "-"); //kurzer Trennstrich
trans.put("®", " trademark "); //Registriermarke-Zeichen
trans.put("¯", " "); //Überstrich
trans.put("°", " degree "); //Grad-Zeichen
trans.put("±", " +/- "); //Plusminus-Zeichen
trans.put("²", " square "); //Hoch-2-Zeichen
trans.put("³", " 3 "); //Hoch-3-Zeichen
trans.put("´", " "); //Acute-Zeichen
trans.put("µ", " micro "); //Mikro-Zeichen
trans.put("¶", " paragraph "); //Absatz-Zeichen
trans.put("·", " "); //Mittelpunkt
trans.put("¸", " "); //Häkchen unten
trans.put("¹", " "); //Hoch-1-Zeichen
trans.put("º", " degree "); //Ordinal-Zeichen männlich
trans.put("»", " "); //angewinkelte Anführungszeichen rechts
trans.put("¼", " quarter "); //ein Viertel
trans.put("½", " half "); //ein Halb
trans.put("¾", " 3/4 "); //drei Viertel
trans.put("¿", "?"); //umgekehrtes Fragezeichen
trans.put("À", "A"); //A mit Accent grave
trans.put("Á", "A"); //A mit Accent acute
trans.put("Â", "A"); //A mit Circumflex
trans.put("Ã", "A"); //A mit Tilde
trans.put("Ä", "Ae"); //A Umlaut
trans.put("Å", "A"); //A mit Ring
trans.put("Æ", "A"); //A mit legiertem E
trans.put("Ç", "C"); //C mit Häkchen
trans.put("È", "E"); //E mit Accent grave
trans.put("É", "E"); //E mit Accent acute
trans.put("Ê", "E"); //E mit Circumflex
trans.put("Ë", "E"); //E Umlaut
trans.put("Ì", "I"); //I mit Accent grave
trans.put("Í", "I"); //I mit Accent acute
trans.put("Î", "I"); //I mit Circumflex
trans.put("Ï", "I"); //I Umlaut
trans.put("Ð", "D"); //Eth (isländisch)
trans.put("Ñ", "N"); //N mit Tilde
trans.put("Ò", "O"); //O mit Accent grave
trans.put("Ó", "O"); //O mit Accent acute
trans.put("Ô", "O"); //O mit Circumflex
trans.put("Õ", "O"); //O mit Tilde
trans.put("Ö", "Oe"); //O Umlaut
trans.put("×", " times "); //Mal-Zeichen
trans.put("Ø", "O"); //O mit Schrägstrich
trans.put("Ù", "U"); //U mit Accent grave
trans.put("Ú", "U"); //U mit Accent acute
trans.put("Û", "U"); //U mit Circumflex
trans.put("Ü", "Ue"); //U Umlaut
trans.put("Ý", "Y"); //Y mit Accent acute
trans.put("Þ", "P"); //THORN (isländisch)
trans.put("ß", "ss"); //scharfes S
trans.put("à", "a"); //a mit Accent grave
trans.put("á", "a"); //a mit Accent acute
trans.put("â", "a"); //a mit Circumflex
trans.put("ã", "a"); //a mit Tilde
trans.put("ä", "ae"); //a Umlaut
trans.put("å", "a"); //a mit Ring
trans.put("æ", "a"); //a mit legiertem e
trans.put("ç", "c"); //c mit Häkchen
trans.put("è", "e"); //e mit Accent grave
trans.put("é", "e"); //e mit Accent acute
trans.put("ê", "e"); //e mit Circumflex
trans.put("ë", "e"); //e Umlaut
trans.put("ì", "i"); //i mit Accent grave
trans.put("í", "i"); //i mit Accent acute
trans.put("î", "i"); //i mit Circumflex
trans.put("ï", "i"); //i Umlaut
trans.put("ð", "d"); //eth (isländisch)
trans.put("ñ", "n"); //n mit Tilde
trans.put("ò", "o"); //o mit Accent grave
trans.put("ó", "o"); //o mit Accent acute
trans.put("ô", "o"); //o mit Circumflex
trans.put("õ", "o"); //o mit Tilde
trans.put("ö", "oe"); //o Umlaut
trans.put("÷", "%"); //Divisions-Zeichen
trans.put("ø", "o"); //o mit Schrägstrich
trans.put("ù", "u"); //u mit Accent grave
trans.put("ú", "u"); //u mit Accent acute
trans.put("û", "u"); //u mit Circumflex
trans.put("ü", "ue"); //u Umlaut
trans.put("ý", "y"); //y mit Accent acute
trans.put("þ", "p"); //thorn (isländisch)
trans.put("ÿ", "y"); //y Umlaut
trans.put("Α", " Alpha "); //Alpha groß
trans.put("α", " alpha "); //alpha klein
trans.put("Β", " Beta "); //Beta groß
trans.put("β", " beta "); //beta klein
trans.put("Γ", " Gamma "); //Gamma groß
trans.put("γ", " gamma "); //gamma klein
trans.put("Δ", " Delta "); //Delta groß
trans.put("δ", " delta "); //delta klein
trans.put("Ε", " Epsilon "); //Epsilon groß
trans.put("ε", " epsilon "); //epsilon klein
trans.put("Ζ", " Zeta "); //Zeta groß
trans.put("ζ", " zeta "); //zeta klein
trans.put("Η", " Eta "); //Eta groß
trans.put("η", " eta "); //eta klein
trans.put("Θ", " Theta "); //Theta groß
trans.put("θ", " theta "); //theta klein
trans.put("Ι", " Iota "); //Iota groß
trans.put("ι", " iota "); //iota klein
trans.put("Κ", " Kappa "); //Kappa groß
trans.put("κ", " kappa "); //kappa klein
trans.put("Λ", " Lambda "); //Lambda groß
trans.put("λ", " lambda "); //lambda klein
trans.put("Μ", " Mu "); //Mu groß
trans.put("μ", " mu "); //mu klein
trans.put("Ν", " Nu "); //Nu groß
trans.put("ν", " nu "); //nu klein
trans.put("Ξ", " Xi "); //Xi groß
trans.put("ξ", " xi "); //xi klein
trans.put("Ο", " Omicron "); //Omicron groß
trans.put("ο", " omicron "); //omicron klein
trans.put("Π", " Pi "); //Pi groß
trans.put("π", " pi "); //pi klein
trans.put("Ρ", " Rho "); //Rho groß
trans.put("ρ", " rho "); //rho klein
trans.put("Σ", " Sigma "); //Sigma groß
trans.put("ς", " sigma "); //sigmaf klein
trans.put("σ", " sigma "); //sigma klein
trans.put("Τ", " Tau "); //Tau groß
trans.put("τ", " tau "); //tau klein
trans.put("Υ", " Ypsilon "); //Upsilon groß
trans.put("υ", " ypsilon "); //upsilon klein
trans.put("Φ", " Phi "); //Phi groß
trans.put("φ", " phi "); //phi klein
trans.put("Χ", " Chi "); //Chi groß
trans.put("χ", " chi "); //chi klein
trans.put("Ψ", " Psi "); //Psi groß
trans.put("ψ", " psi "); //psi klein
trans.put("Ω", " Omega "); //Omega groß
trans.put("ω", " omega "); //omega klein
trans.put("ϑ", " theta "); //theta Symbol
trans.put("ϒ", " ypsilon "); //upsilon mit Haken
trans.put("ϖ", " pi "); //pi Symbol
trans.put("∀", " for all "); //für alle
trans.put("∂", " part of "); //teilweise
trans.put("∃", " exists "); //existiert
trans.put("∅", " null "); //leer
trans.put("∇", " nabla "); //nabla
trans.put("∈", " element of "); //Element von
trans.put("∉", " not element of "); //kein Element von
trans.put("∋", " contains "); //enthält als Element
trans.put("∏", " product "); //Produkt
trans.put("∑", " sum "); //Summe
trans.put("−", " minus "); //minus
trans.put("∗", " times "); //Asterisk
trans.put("√", " sqare root "); //Quadratwurzel
trans.put("∝", " proportional to "); //proportional zu
trans.put("∞", " unlimited "); //unendlich
trans.put("∠", " angle "); //Winkel
trans.put("∧", " and "); //und
trans.put("∨", " or "); //oder
trans.put("∩", " "); //Schnittpunkt
trans.put("∪", " unity "); //Einheit
trans.put("∫", " integral "); //Integral
trans.put("∴", " cause "); //deshalb
trans.put("∼", " similar to "); //ähnlich wie
trans.put("≅", " equal "); //annähernd gleich
trans.put("≈", " equal "); //beinahe gleich
trans.put("≠", " not equal "); //ungleich
trans.put("≡", " identical "); //identisch mit
trans.put("≤", " smaller or equal than "); //kleiner gleich
trans.put("≥", " greater or equal than "); //größer gleich
trans.put("⊂", " subset of "); //Untermenge von
trans.put("⊃", " superset of "); //Obermenge von
trans.put("⊄", " not subset of "); //keine Untermenge von
trans.put("⊆", ""); //Untermenge von oder gleich mit
trans.put("⊇", ""); //Obermenge von oder gleich mit
trans.put("⊕", ""); //Direktsumme
trans.put("⊗", ""); //Vektorprodukt
trans.put("⊥", ""); //senkrecht zu
trans.put("⋅", ""); //Punkt-Operator
trans.put("◊", ""); //Raute
trans.put("⌈", ""); //links oben
trans.put("⌉", ""); //rechts oben
trans.put("⌊", ""); //links unten
trans.put("⌋", ""); //rechts unten
trans.put("〈", ""); //spitze Klammer links
trans.put("〉", ""); //spitze Klammer rechts
trans.put("←", ""); //Pfeil links
trans.put("↑", ""); //Pfeil oben
trans.put("→", ""); //Pfeil rechts
trans.put("↓", ""); //Pfeil unten
trans.put("↔", ""); //Pfeil links/rechts
trans.put("↵", ""); //Pfeil unten-Knick-links
trans.put("⇐", ""); //Doppelpfeil links
trans.put("⇑", ""); //Doppelpfeil oben
trans.put("⇒", ""); //Doppelpfeil rechts
trans.put("⇓", ""); //Doppelpfeil unten
trans.put("⇔", ""); //Doppelpfeil links/rechts
trans.put("•", ""); //Bullet-Zeichen
trans.put("…", ""); //Horizontale Ellipse
trans.put("′", ""); //Minutenzeichen
trans.put("‾", ""); //Überstrich
trans.put("⁄", ""); //Bruchstrich
trans.put("℘", ""); //Weierstrass p
trans.put("ℑ", ""); //Zeichen für "imaginär"
trans.put("ℜ", ""); //Zeichen für "real"
trans.put("™", ""); //Trademark-Zeichen
trans.put("€", ""); //Euro-Zeichen
trans.put("ℵ", ""); //Alef-Symbol
trans.put("♠", ""); //Pik-Zeichen
trans.put("♣", ""); //Kreuz-Zeichen
trans.put("♥", ""); //Herz-Zeichen
trans.put("♦", ""); //Karo-Zeichen
trans.put(" ", ""); //Leerzeichen Breite n
trans.put(" ", ""); //Leerzeichen Breite m
trans.put(" ", ""); //Schmales Leerzeichen
trans.put("", ""); //null breiter Nichtverbinder
trans.put("", ""); //null breiter Verbinder
trans.put("", ""); //links-nach-rechts-Zeichen
trans.put("", ""); //rechts-nach-links-Zeichen
trans.put("–", ""); //Gedankenstrich Breite n
trans.put("—", ""); //Gedankenstrich Breite m
trans.put("‘", ""); //einfaches Anführungszeichen links
trans.put("’", ""); //einfaches Anführungszeichen rechts
trans.put("‚", ""); //einfaches low-9-Zeichen
trans.put("“", ""); //doppeltes Anführungszeichen links
trans.put("”", ""); //doppeltes Anführungszeichen rechts
trans.put("„", ""); //doppeltes low-9-Zeichen rechts
trans.put("†", ""); //Kreuz
trans.put("‡", ""); //Doppelkreuz
trans.put("‰", ""); //zu tausend
trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links
trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts
}
public htmlFilterAbstractScraper(HashSet tags0, HashSet tags1) {
this.tags0 = tags0;
this.tags1 = tags1;
}
public boolean isTag0(String tag) {
return (tags0 != null) && (tags0.contains(tag));
}
public boolean isTag1(String tag) {
return (tags1 != null) && (tags1.contains(tag));
}
//the 'missing' method that shall be implemented:
public abstract void scrapeText(byte[] text);
// the other methods must take into account to construct the return value correctly
public void scrapeTag0(String tagname, Properties tagopts) {
}
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
}
// string conversions
private static String code_iso8859s(int c) {
switch ((int) c & 0xff) {
// german umlaute and ligaturen
case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE";
case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue";
case 0xdf: return "ss";
// accent on letters; i.e. french characters
case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A";
case 0xc6: return "AE";
case 0xc7: return "C";
case 0xc8: case 0xc9: case 0xca: return "E";
case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I";
case 0xd0: return "D";
case 0xd1: return "N";
case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O";
case 0xd7: return "x";
case 0xd9: case 0xda: case 0xdb: return "U";
case 0xdd: return "Y";
case 0xde: return "p";
case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a";
case 0xe6: return "ae";
case 0xe7: return "c";
case 0xe8: case 0xe9: case 0xea: return "e";
case 0xec: case 0xed: case 0xee: case 0xef: return "i";
case 0xf0: return "d";
case 0xf1: return "n";
case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o";
case 0xf7: return "%";
case 0xf9: case 0xfa: case 0xfb: return "u";
case 0xfd: case 0xff: return "y";
case 0xfe: return "p";
// special characters
case 0xa4: return " euro ";
default: return null;
}
}
public static serverByteBuffer convertUmlaute(serverByteBuffer bb) {
serverByteBuffer t = new serverByteBuffer(bb.length() + 20);
int b0, b1, b2;
String z;
int i = 0;
while (i < bb.length()) {
b0 = bb.byteAt(i) & 0xff;
// check utf-8 encoding
if ((b0 < 128) || (i + 1 == bb.length())) {
t.append(b0);
i++;
} else {
b1 = bb.byteAt(i + 1) & 0xff;
if (b1 > 0x3f) {
z = code_iso8859s(b0);
i++;
} else if ((b0 > 0xbf) && (b0 < 0xe0)) {
z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f));
i += 2;
} else {
if (i + 2 >= bb.length()) {
z = null;
i++;
} else {
b2 = bb.byteAt(i + 2) & 0xff;
if (b2 > 0x3f) {
z = code_iso8859s(b0);
i++;
} else {
z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f));
i += 3;
}
}
}
if (z == null) t.append(b0); else t.append(z);
}
}
return t;
}
private static byte[] transscript(byte[] code) {
String t = (String) trans.get(new String(code));
if (t == null) return new byte[0]; else return t.getBytes();
}
protected static serverByteBuffer transscriptAll(serverByteBuffer bb) {
int p0 = 0, p1;
byte[] t;
while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) {
p1 = bb.indexOf((byte) ';', p0);
if (p1 >= 0) {
t = transscript(bb.getBytes(p0, p1 + 1));
bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1));
} else {
bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1));
}
}
t = null;
return bb;
}
protected static serverByteBuffer stripAllTags(serverByteBuffer bb) {
int p0 = 0, p1;
while ((p0 = bb.indexOf(lb, p0)) >= 0) {
p1 = bb.indexOf(rb, p0);
if (p1 >= 0) {
bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim());
} else {
bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim());
}
}
return bb.trim();
}
public static serverByteBuffer stripAll(serverByteBuffer bb) {
//return stripAllTags(s);
return convertUmlaute(transscriptAll(stripAllTags(bb)));
}
public void close() {
// free resources
tags0 = null;
tags1 = null;
}
public void finalize() {
close();
}
}
\ No newline at end of file
+// htmlFilterAbstractScraper.java
// ---------------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 18.02.2004
//
// You agree that the Author(s) is (are) not responsible for cost,
// loss of data or any harm that may be caused by usage of this softare or
// this documentation. The usage of this software is on your own risk. The
// installation and usage (starting/running) of this software may allow other
// people or application to access your computer and any attached devices and
// is highly dependent on the configuration of the software which must be
// done by the user of the software;the author(s) is (are) also
// not responsible for proper configuration and usage of the software, even
// if provoked by documentation provided together with the software.
//
// THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION
// IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS
// FILE AND AS IN http://www.gnu.org/licenses/gpl.txt
// ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE
// LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT
// BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION
// MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE.
// CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH.
package de.anomic.htmlFilter;
import java.util.TreeSet;
import java.util.HashMap;
import java.util.Properties;
import de.anomic.server.serverByteBuffer;
public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
// byte values of the HTML markup delimiters '<', '>' and '/'
public static final byte lb = (byte) '<';
public static final byte rb = (byte) '>';
public static final byte sl = (byte) '/';
// tag-name sets queried by isTag0() and isTag1(); both are handed in through the
// constructor, may be null, and are reset to null by close()
private TreeSet tags0;
private TreeSet tags1;
// translation table for html character codings: maps an entity string to its
// replacement string; filled by the static initializer and read by transscript()
private static HashMap trans = new HashMap(300);
// Fills the entity translation table. Every key is the complete character entity reference
// exactly as it appears in an HTML document ("&name;"), because transscript() looks up the
// raw bytes from '&' up to and including ';'. The value is the plain-text replacement; an
// empty value means that the entity is dropped from the text.
static {
    trans.put("&quot;", "\""); // quotation mark
    trans.put("&amp;", "&"); // ampersand
    trans.put("&lt;", "<"); // opening angle bracket
    trans.put("&gt;", ">"); // closing angle bracket
    trans.put("&nbsp;", " "); // non-breaking space
    trans.put("&iexcl;", "!"); // inverted exclamation mark
    trans.put("&cent;", " cent "); // cent sign
    trans.put("&pound;", " pound "); // pound sign
    trans.put("&curren;", " currency "); // currency sign
    trans.put("&yen;", " yen "); // yen sign
    trans.put("&brvbar;", " "); // broken bar
    trans.put("&sect;", " paragraph "); // section sign
    trans.put("&uml;", " "); // diaeresis
    trans.put("&copy;", " copyright "); // copyright sign
    trans.put("&ordf;", " "); // feminine ordinal indicator
    trans.put("&laquo;", " "); // left angle quotation mark
    trans.put("&not;", " not "); // not sign
    trans.put("&shy;", "-"); // soft hyphen
    trans.put("&reg;", " trademark "); // registered sign
    trans.put("&macr;", " "); // macron
    trans.put("&deg;", " degree "); // degree sign
    trans.put("&plusmn;", " +/- "); // plus-minus sign
    trans.put("&sup2;", " square "); // superscript two
    trans.put("&sup3;", " 3 "); // superscript three
    trans.put("&acute;", " "); // acute accent
    trans.put("&micro;", " micro "); // micro sign
    trans.put("&para;", " paragraph "); // pilcrow sign
    trans.put("&middot;", " "); // middle dot
    trans.put("&cedil;", " "); // cedilla
    trans.put("&sup1;", " "); // superscript one
    trans.put("&ordm;", " degree "); // masculine ordinal indicator
    trans.put("&raquo;", " "); // right angle quotation mark
    trans.put("&frac14;", " quarter "); // one quarter
    trans.put("&frac12;", " half "); // one half
    trans.put("&frac34;", " 3/4 "); // three quarters
    trans.put("&iquest;", "?"); // inverted question mark
    trans.put("&Agrave;", "A"); // A with grave accent
    trans.put("&Aacute;", "A"); // A with acute accent
    trans.put("&Acirc;", "A"); // A with circumflex
    trans.put("&Atilde;", "A"); // A with tilde
    trans.put("&Auml;", "Ae"); // A umlaut
    trans.put("&Aring;", "A"); // A with ring
    trans.put("&AElig;", "A"); // AE ligature
    trans.put("&Ccedil;", "C"); // C with cedilla
    trans.put("&Egrave;", "E"); // E with grave accent
    trans.put("&Eacute;", "E"); // E with acute accent
    trans.put("&Ecirc;", "E"); // E with circumflex
    trans.put("&Euml;", "E"); // E umlaut
    trans.put("&Igrave;", "I"); // I with grave accent
    trans.put("&Iacute;", "I"); // I with acute accent
    trans.put("&Icirc;", "I"); // I with circumflex
    trans.put("&Iuml;", "I"); // I umlaut
    trans.put("&ETH;", "D"); // Eth (Icelandic)
    trans.put("&Ntilde;", "N"); // N with tilde
    trans.put("&Ograve;", "O"); // O with grave accent
    trans.put("&Oacute;", "O"); // O with acute accent
    trans.put("&Ocirc;", "O"); // O with circumflex
    trans.put("&Otilde;", "O"); // O with tilde
    trans.put("&Ouml;", "Oe"); // O umlaut
    trans.put("&times;", " times "); // multiplication sign
    trans.put("&Oslash;", "O"); // O with stroke
    trans.put("&Ugrave;", "U"); // U with grave accent
    trans.put("&Uacute;", "U"); // U with acute accent
    trans.put("&Ucirc;", "U"); // U with circumflex
    trans.put("&Uuml;", "Ue"); // U umlaut
    trans.put("&Yacute;", "Y"); // Y with acute accent
    trans.put("&THORN;", "P"); // THORN (Icelandic)
    trans.put("&szlig;", "ss"); // sharp s
    trans.put("&agrave;", "a"); // a with grave accent
    trans.put("&aacute;", "a"); // a with acute accent
    trans.put("&acirc;", "a"); // a with circumflex
    trans.put("&atilde;", "a"); // a with tilde
    trans.put("&auml;", "ae"); // a umlaut
    trans.put("&aring;", "a"); // a with ring
    trans.put("&aelig;", "a"); // ae ligature
    trans.put("&ccedil;", "c"); // c with cedilla
    trans.put("&egrave;", "e"); // e with grave accent
    trans.put("&eacute;", "e"); // e with acute accent
    trans.put("&ecirc;", "e"); // e with circumflex
    trans.put("&euml;", "e"); // e umlaut
    trans.put("&igrave;", "i"); // i with grave accent
    trans.put("&iacute;", "i"); // i with acute accent
    trans.put("&icirc;", "i"); // i with circumflex
    trans.put("&iuml;", "i"); // i umlaut
    trans.put("&eth;", "d"); // eth (Icelandic)
    trans.put("&ntilde;", "n"); // n with tilde
    trans.put("&ograve;", "o"); // o with grave accent
    trans.put("&oacute;", "o"); // o with acute accent
    trans.put("&ocirc;", "o"); // o with circumflex
    trans.put("&otilde;", "o"); // o with tilde
    trans.put("&ouml;", "oe"); // o umlaut
    trans.put("&divide;", "%"); // division sign
    trans.put("&oslash;", "o"); // o with stroke
    trans.put("&ugrave;", "u"); // u with grave accent
    trans.put("&uacute;", "u"); // u with acute accent
    trans.put("&ucirc;", "u"); // u with circumflex
    trans.put("&uuml;", "ue"); // u umlaut
    trans.put("&yacute;", "y"); // y with acute accent
    trans.put("&thorn;", "p"); // thorn (Icelandic)
    trans.put("&yuml;", "y"); // y umlaut
    trans.put("&Alpha;", " Alpha "); // capital Alpha
    trans.put("&alpha;", " alpha "); // small alpha
    trans.put("&Beta;", " Beta "); // capital Beta
    trans.put("&beta;", " beta "); // small beta
    trans.put("&Gamma;", " Gamma "); // capital Gamma
    trans.put("&gamma;", " gamma "); // small gamma
    trans.put("&Delta;", " Delta "); // capital Delta
    trans.put("&delta;", " delta "); // small delta
    trans.put("&Epsilon;", " Epsilon "); // capital Epsilon
    trans.put("&epsilon;", " epsilon "); // small epsilon
    trans.put("&Zeta;", " Zeta "); // capital Zeta
    trans.put("&zeta;", " zeta "); // small zeta
    trans.put("&Eta;", " Eta "); // capital Eta
    trans.put("&eta;", " eta "); // small eta
    trans.put("&Theta;", " Theta "); // capital Theta
    trans.put("&theta;", " theta "); // small theta
    trans.put("&Iota;", " Iota "); // capital Iota
    trans.put("&iota;", " iota "); // small iota
    trans.put("&Kappa;", " Kappa "); // capital Kappa
    trans.put("&kappa;", " kappa "); // small kappa
    trans.put("&Lambda;", " Lambda "); // capital Lambda
    trans.put("&lambda;", " lambda "); // small lambda
    trans.put("&Mu;", " Mu "); // capital Mu
    trans.put("&mu;", " mu "); // small mu
    trans.put("&Nu;", " Nu "); // capital Nu
    trans.put("&nu;", " nu "); // small nu
    trans.put("&Xi;", " Xi "); // capital Xi
    trans.put("&xi;", " xi "); // small xi
    trans.put("&Omicron;", " Omicron "); // capital Omicron
    trans.put("&omicron;", " omicron "); // small omicron
    trans.put("&Pi;", " Pi "); // capital Pi
    trans.put("&pi;", " pi "); // small pi
    trans.put("&Rho;", " Rho "); // capital Rho
    trans.put("&rho;", " rho "); // small rho
    trans.put("&Sigma;", " Sigma "); // capital Sigma
    trans.put("&sigmaf;", " sigma "); // small final sigma
    trans.put("&sigma;", " sigma "); // small sigma
    trans.put("&Tau;", " Tau "); // capital Tau
    trans.put("&tau;", " tau "); // small tau
    trans.put("&Upsilon;", " Ypsilon "); // capital Upsilon
    trans.put("&upsilon;", " ypsilon "); // small upsilon
    trans.put("&Phi;", " Phi "); // capital Phi
    trans.put("&phi;", " phi "); // small phi
    trans.put("&Chi;", " Chi "); // capital Chi
    trans.put("&chi;", " chi "); // small chi
    trans.put("&Psi;", " Psi "); // capital Psi
    trans.put("&psi;", " psi "); // small psi
    trans.put("&Omega;", " Omega "); // capital Omega
    trans.put("&omega;", " omega "); // small omega
    trans.put("&thetasym;", " theta "); // theta symbol
    trans.put("&upsih;", " ypsilon "); // upsilon with hook
    trans.put("&piv;", " pi "); // pi symbol
    trans.put("&forall;", " for all "); // for all
    trans.put("&part;", " part of "); // partial
    trans.put("&exist;", " exists "); // there exists
    trans.put("&empty;", " null "); // empty set
    trans.put("&nabla;", " nabla "); // nabla
    trans.put("&isin;", " element of "); // element of
    trans.put("&notin;", " not element of "); // not an element of
    trans.put("&ni;", " contains "); // contains as member
    trans.put("&prod;", " product "); // product
    trans.put("&sum;", " sum "); // sum
    trans.put("&minus;", " minus "); // minus
    trans.put("&lowast;", " times "); // asterisk operator
    trans.put("&radic;", " square root "); // square root
    trans.put("&prop;", " proportional to "); // proportional to
    trans.put("&infin;", " unlimited "); // infinity
    trans.put("&ang;", " angle "); // angle
    trans.put("&and;", " and "); // logical and
    trans.put("&or;", " or "); // logical or
    trans.put("&cap;", " "); // intersection
    trans.put("&cup;", " unity "); // union
    trans.put("&int;", " integral "); // integral
    trans.put("&there4;", " cause "); // therefore
    trans.put("&sim;", " similar to "); // similar to
    trans.put("&cong;", " equal "); // approximately equal
    trans.put("&asymp;", " equal "); // almost equal
    trans.put("&ne;", " not equal "); // not equal
    trans.put("&equiv;", " identical "); // identical to
    trans.put("&le;", " smaller or equal than "); // less than or equal
    trans.put("&ge;", " greater or equal than "); // greater than or equal
    trans.put("&sub;", " subset of "); // subset of
    trans.put("&sup;", " superset of "); // superset of
    trans.put("&nsub;", " not subset of "); // not a subset of
    trans.put("&sube;", ""); // subset of or equal to
    trans.put("&supe;", ""); // superset of or equal to
    trans.put("&oplus;", ""); // direct sum
    trans.put("&otimes;", ""); // vector product
    trans.put("&perp;", ""); // perpendicular to
    trans.put("&sdot;", ""); // dot operator
    trans.put("&loz;", ""); // lozenge
    trans.put("&lceil;", ""); // left ceiling
    trans.put("&rceil;", ""); // right ceiling
    trans.put("&lfloor;", ""); // left floor
    trans.put("&rfloor;", ""); // right floor
    trans.put("&lang;", ""); // left angle bracket
    trans.put("&rang;", ""); // right angle bracket
    trans.put("&larr;", ""); // left arrow
    trans.put("&uarr;", ""); // up arrow
    trans.put("&rarr;", ""); // right arrow
    trans.put("&darr;", ""); // down arrow
    trans.put("&harr;", ""); // left/right arrow
    trans.put("&crarr;", ""); // down arrow with corner leftwards
    trans.put("&lArr;", ""); // left double arrow
    trans.put("&uArr;", ""); // up double arrow
    trans.put("&rArr;", ""); // right double arrow
    trans.put("&dArr;", ""); // down double arrow
    trans.put("&hArr;", ""); // left/right double arrow
    trans.put("&bull;", ""); // bullet
    trans.put("&hellip;", ""); // horizontal ellipsis
    trans.put("&prime;", ""); // prime (minutes)
    trans.put("&oline;", ""); // overline
    trans.put("&frasl;", ""); // fraction slash
    trans.put("&weierp;", ""); // Weierstrass p
    trans.put("&image;", ""); // symbol for "imaginary"
    trans.put("&real;", ""); // symbol for "real"
    trans.put("&trade;", ""); // trademark sign
    trans.put("&euro;", ""); // euro sign
    trans.put("&alefsym;", ""); // alef symbol
    trans.put("&spades;", ""); // spade suit
    trans.put("&clubs;", ""); // club suit
    trans.put("&hearts;", ""); // heart suit
    trans.put("&diams;", ""); // diamond suit
    trans.put("&ensp;", ""); // en space
    trans.put("&emsp;", ""); // em space
    trans.put("&thinsp;", ""); // thin space
    trans.put("&zwnj;", ""); // zero-width non-joiner
    trans.put("&zwj;", ""); // zero-width joiner
    trans.put("&lrm;", ""); // left-to-right mark
    trans.put("&rlm;", ""); // right-to-left mark
    trans.put("&ndash;", ""); // en dash
    trans.put("&mdash;", ""); // em dash
    trans.put("&lsquo;", ""); // left single quotation mark
    trans.put("&rsquo;", ""); // right single quotation mark
    trans.put("&sbquo;", ""); // single low-9 quotation mark
    trans.put("&ldquo;", ""); // left double quotation mark
    trans.put("&rdquo;", ""); // right double quotation mark
    trans.put("&bdquo;", ""); // double low-9 quotation mark
    trans.put("&dagger;", ""); // dagger
    trans.put("&Dagger;", ""); // double dagger
    trans.put("&permil;", ""); // per mille
    trans.put("&lsaquo;", ""); // single left angle quotation mark
    trans.put("&rsaquo;", ""); // single right angle quotation mark
}
/**
 * Stores the two tag-name sets that isTag0() and isTag1() are answered from.
 * The sets are kept by reference (no copy is made); passing null for either one
 * makes the corresponding isTagN() method always return false.
 *
 * @param tags0 set queried by isTag0(), may be null
 * @param tags1 set queried by isTag1(), may be null
 */
public htmlFilterAbstractScraper(TreeSet tags0, TreeSet tags1) {
this.tags0 = tags0;
this.tags1 = tags1;
}
/**
 * Tells whether the given tag name is a member of the tags0 set.
 *
 * @param tag tag name to look up
 * @return true if a tags0 set is present and contains the name, false otherwise
 */
public boolean isTag0(String tag) {
    if (tags0 == null) {
        // no set configured (or already released by close())
        return false;
    }
    return tags0.contains(tag);
}
/**
 * Tells whether the given tag name is a member of the tags1 set.
 *
 * @param tag tag name to look up
 * @return true if a tags1 set is present and contains the name, false otherwise
 */
public boolean isTag1(String tag) {
    if (tags1 == null) {
        // no set configured (or already released by close())
        return false;
    }
    return tags1.contains(tag);
}
// The one scraping method this base class does not provide: concrete scrapers must
// implement it. It receives a piece of text as raw bytes.
// NOTE(review): presumably this is the text found between tags - confirm against the caller.
public abstract void scrapeText(byte[] text);
// The tag callbacks below are empty by default; a subclass that overrides them is
// responsible for building its own result from the data it is given.
// scrapeTag0 receives a tag name and that tag's options and does nothing here.
// NOTE(review): presumably invoked for tags accepted by isTag0() - confirm against the caller.
public void scrapeTag0(String tagname, Properties tagopts) {
}
// Empty default callback: receives a tag name, that tag's options and a byte array of
// text, and does nothing; subclasses override it to collect what they need.
// NOTE(review): presumably invoked for tags accepted by isTag1(), with the text enclosed
// by the tag - confirm against the caller.
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
}
// string conversions
/**
 * Maps one ISO-8859-1 character code to a plain-ASCII replacement string.
 *
 * German umlauts and sharp s are expanded to two letters, accented letters lose
 * their accent, and a few symbols are spelled out.
 *
 * @param c character code; either 0..0xff or a sign-extended byte (-128..-1)
 * @return the replacement, or null if there is none for this code or the code
 *         lies outside the single-byte range
 */
private static String code_iso8859s(int c) {
    // Only one-byte codes are table entries. Without this guard a larger code point
    // (e.g. U+01E4) would be truncated by the mask below and alias an unrelated letter.
    if ((c < -128) || (c > 0xff)) return null;
    switch (c & 0xff) {
        // german umlauts and ligatures
        case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE";
        case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue";
        case 0xdf: return "ss";
        // accents on letters; i.e. french characters
        case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A";
        case 0xc6: return "AE";
        case 0xc7: return "C";
        case 0xc8: case 0xc9: case 0xca: case 0xcb: return "E"; // 0xcb = E with diaeresis
        case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I";
        case 0xd0: return "D";
        case 0xd1: return "N";
        case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O";
        case 0xd7: return "x";
        case 0xd9: case 0xda: case 0xdb: return "U";
        case 0xdd: return "Y";
        case 0xde: return "P"; // capital Thorn, upper case like the &THORN; entity mapping
        case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a";
        case 0xe6: return "ae";
        case 0xe7: return "c";
        case 0xe8: case 0xe9: case 0xea: case 0xeb: return "e"; // 0xeb = e with diaeresis
        case 0xec: case 0xed: case 0xee: case 0xef: return "i";
        case 0xf0: return "d";
        case 0xf1: return "n";
        case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o";
        case 0xf7: return "%";
        case 0xf9: case 0xfa: case 0xfb: return "u";
        case 0xfd: case 0xff: return "y";
        case 0xfe: return "p";
        // special characters
        // NOTE(review): 0xa4 is the euro sign in ISO-8859-15 but the generic currency
        // sign in ISO-8859-1; the existing mapping is kept.
        case 0xa4: return " euro ";
        default: return null;
    }
}
/**
 * Copies the buffer while replacing umlauts, accented letters and a few symbols by
 * the ASCII strings that code_iso8859s() defines.
 *
 * The input may hold such characters either as single ISO-8859-1 bytes or as UTF-8
 * sequences. A high byte is treated as the start of a UTF-8 sequence only if it is a
 * valid lead byte AND all bytes that must follow it are continuation bytes
 * (10xxxxxx); otherwise it is taken as one ISO-8859-1 character. Bytes for which no
 * replacement exists are copied unchanged, so nothing is lost from the input.
 *
 * @param bb source bytes; not modified
 * @return a new buffer with the converted content
 */
public static serverByteBuffer convertUmlaute(serverByteBuffer bb) {
    final int len = bb.length();
    serverByteBuffer t = new serverByteBuffer(len + 20);
    int i = 0;
    while (i < len) {
        final int b0 = bb.byteAt(i) & 0xff;
        if (b0 < 0x80) {
            // plain ASCII passes through
            t.append(b0);
            i++;
            continue;
        }

        // default: one ISO-8859-1 byte
        int code = b0;
        int consumed = 1;
        if ((b0 >= 0xc0) && (b0 < 0xe0)) {
            // candidate two-byte UTF-8 sequence
            if ((i + 1 < len) && ((bb.byteAt(i + 1) & 0xc0) == 0x80)) {
                code = ((b0 & 0x1f) << 6) | (bb.byteAt(i + 1) & 0x3f);
                consumed = 2;
            }
        } else if ((b0 >= 0xe0) && (b0 < 0xf0)) {
            // candidate three-byte UTF-8 sequence
            if ((i + 2 < len)
                    && ((bb.byteAt(i + 1) & 0xc0) == 0x80)
                    && ((bb.byteAt(i + 2) & 0xc0) == 0x80)) {
                code = ((b0 & 0x0f) << 12) | ((bb.byteAt(i + 1) & 0x3f) << 6) | (bb.byteAt(i + 2) & 0x3f);
                consumed = 3;
            }
        } else if ((b0 >= 0xf0) && (b0 < 0xf8)) {
            // candidate four-byte UTF-8 sequence: never in the replacement table, but it
            // must be skipped as a unit so its trailing bytes are not misread as ISO-8859-1
            if ((i + 3 < len)
                    && ((bb.byteAt(i + 1) & 0xc0) == 0x80)
                    && ((bb.byteAt(i + 2) & 0xc0) == 0x80)
                    && ((bb.byteAt(i + 3) & 0xc0) == 0x80)) {
                code = 0x10000; // any value above 0xff: forces the raw copy below
                consumed = 4;
            }
        }

        // the table only covers U+0000..U+00FF
        final String z = (code <= 0xff) ? code_iso8859s(code) : null;
        if (z == null) {
            // no replacement: keep every byte of the character exactly as it was
            for (int k = 0; k < consumed; k++) t.append(bb.byteAt(i + k) & 0xff);
        } else {
            t.append(z);
        }
        i += consumed;
    }
    return t;
}
/**
 * Looks up the replacement for one character entity in the translation table.
 *
 * @param code bytes of the entity, used as lookup key after conversion to a String
 * @return bytes of the replacement text, or an empty array if the key is unknown
 */
private static byte[] transscript(byte[] code) {
    final Object replacement = trans.get(new String(code));
    return (replacement == null) ? new byte[0] : ((String) replacement).getBytes();
}
/**
 * Replaces every "&...;" span in the buffer by the text that transscript() returns
 * for it (nothing, if the span is not a known entity). An '&' that is not followed by
 * any ';' is removed.
 *
 * The search continues BEHIND each inserted replacement. This matters for entities
 * whose replacement itself contains '&' (e.g. the ampersand entity): rescanning the
 * replacement would treat that '&' as the start of another entity and delete all text
 * up to the next ';'.
 *
 * @param bb source bytes
 * @return a buffer with the entities replaced
 */
protected static serverByteBuffer transscriptAll(serverByteBuffer bb) {
    int p0 = 0, p1;
    byte[] t;
    while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) {
        p1 = bb.indexOf((byte) ';', p0);
        if (p1 >= 0) {
            t = transscript(bb.getBytes(p0, p1 + 1));
            bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1));
            // skip what was just inserted; it is final text, not markup
            p0 += t.length;
        } else {
            // lone '&': drop it; the byte that followed it is now at p0 and is scanned next
            bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1));
        }
    }
    return bb;
}
/**
 * Removes every span that starts with lb and ends with the next rb from
 * the buffer, joining the trimmed text before and after the span with a
 * single space (byte 32). An lb that has no rb after it is removed on its
 * own and the trimmed neighbours are joined without a separator.
 *
 * The scan position is reset to the end of the trimmed prefix after every
 * removal: if trim() shortened the prefix, the text behind the removed span
 * has moved left, and continuing at the old offset could step over an lb
 * that directly follows.
 *
 * @param bb the input buffer
 * @return the trimmed buffer without the removed spans
 */
protected static serverByteBuffer stripAllTags(serverByteBuffer bb) {
    int p0 = 0, p1;
    serverByteBuffer head, tail;
    while ((p0 = bb.indexOf(lb, p0)) >= 0) {
        p1 = bb.indexOf(rb, p0);
        if (p1 >= 0) {
            // drop lb..rb and put one space between the surrounding text
            head = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim();
            tail = new serverByteBuffer(bb.getBytes(p1 + 1)).trim();
            p0 = head.length();
            bb = head.append((byte) 32).append(tail);
        } else {
            // unterminated span: drop only the lb byte
            head = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim();
            tail = new serverByteBuffer(bb.getBytes(p0 + 1)).trim();
            p0 = head.length();
            bb = head.append(tail);
        }
    }
    return bb.trim();
}
/**
 * Runs the complete clean-up chain on a buffer: stripAllTags first, then
 * transscriptAll on its result, then convertUmlaute on that.
 *
 * @param bb the input buffer
 * @return the result of the last step, convertUmlaute
 */
public static serverByteBuffer stripAll(serverByteBuffer bb) {
    final serverByteBuffer withoutTags = stripAllTags(bb);
    final serverByteBuffer transcribed = transscriptAll(withoutTags);
    return convertUmlaute(transcribed);
}
/**
 * Releases the tag sets held by this object by clearing both references.
 */
public void close() {
    // drop both references so the sets can be reclaimed
    tags1 = null;
    tags0 = null;
}
/**
 * Finalizer hook; simply delegates to close().
 */
public void finalize() {
    this.close();
}
}
\ No newline at end of file
diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java b/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java
index a22e84e27..3d36e4a36 100644
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java
@@ -40,15 +40,15 @@
package de.anomic.htmlFilter;
-import java.util.HashSet;
+import java.util.TreeSet;
import java.util.Properties;
public abstract class htmlFilterAbstractTransformer implements htmlFilterTransformer {
- private HashSet tags0;
- private HashSet tags1;
+ private TreeSet tags0;
+ private TreeSet tags1;
- public htmlFilterAbstractTransformer(HashSet tags0, HashSet tags1) {
+ public htmlFilterAbstractTransformer(TreeSet tags0, TreeSet tags1) {
this.tags0 = tags0;
this.tags1 = tags1;
}
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 89d5e6783..7fe778d79 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -43,9 +43,11 @@ package de.anomic.htmlFilter;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.HashMap;
-import java.util.HashSet;
+import java.util.TreeSet;
import java.util.Map;
import java.util.Properties;
+import java.util.Locale;
+import java.text.Collator;
import de.anomic.server.serverByteBuffer;
@@ -54,15 +56,20 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// statics: for initialisation of the HTMLFilterAbstractScraper
- private static HashSet linkTags0;
- private static HashSet linkTags1;
-
+ private static TreeSet linkTags0;
+ private static TreeSet linkTags1;
+ private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
+ static {
+ insensitiveCollator.setStrength(Collator.SECONDARY);
+ insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
+ }
+
static {
- linkTags0 = new HashSet();
+ linkTags0 = new TreeSet(insensitiveCollator);
linkTags0.add("img");
linkTags0.add("base");
- linkTags1 = new HashSet();
+ linkTags1 = new TreeSet(insensitiveCollator);
linkTags1.add("a");
linkTags1.add("h1");
linkTags1.add("title");
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
index a744c0b9d..0d9358131 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
@@ -43,23 +43,30 @@ package de.anomic.htmlFilter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
-import java.util.HashSet;
+import java.util.TreeSet;
import java.util.Properties;
import java.util.Vector;
+import java.util.Locale;
+import java.text.Collator;
import de.anomic.server.serverByteBuffer;
public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer implements htmlFilterTransformer {
// statics: for initialisation of the HTMLFilterAbstractTransformer
- private static HashSet linkTags0;
- private static HashSet linkTags1;
-
+ private static TreeSet linkTags0;
+ private static TreeSet linkTags1;
+ private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
+ static {
+ insensitiveCollator.setStrength(Collator.SECONDARY);
+ insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
+ }
+
static {
- linkTags0 = new HashSet();
+ linkTags0 = new TreeSet(insensitiveCollator);
linkTags0.add("img");
- linkTags1 = new HashSet();
+ linkTags1 = new TreeSet(insensitiveCollator);
linkTags1.add("a");
}
diff --git a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java
index 5408f9612..df6369973 100644
--- a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java
+++ b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java
@@ -254,14 +254,14 @@ public final class htmlFilterOutputStream extends OutputStream {
if (in[1] == '/') {
// a closing tag
tagend = tagEnd(in, 2);
- tag = new String(in, 2, tagend - 2).toLowerCase();
+ tag = new String(in, 2, tagend - 2);
byte[] text = new byte[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(tag, false, text, quotechar);
} else {
// an opening tag
tagend = tagEnd(in, 1);
- tag = new String(in, 1, tagend - 1).toLowerCase();
+ tag = new String(in, 1, tagend - 1);
byte[] text = new byte[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(tag, true, text, quotechar);
diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java
index d7f509ce0..46e951125 100644
--- a/source/de/anomic/http/httpd.java
+++ b/source/de/anomic/http/httpd.java
@@ -223,6 +223,8 @@ public final class httpd implements serverHandler {
int pos;
while (st.hasMoreTokens()) {
pattern = st.nextToken();
+ if (key.matches(pattern)) return true;
+ /*
pos = pattern.indexOf("*");
if (pos < 0) {
// no wild card: exact match
@@ -232,6 +234,7 @@ public final class httpd implements serverHandler {
if ((key.startsWith(pattern.substring(0, pos))) &&
(key.endsWith(pattern.substring(pos + 1)))) return true;
}
+ */
}
return false;
}
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 64fa0dd16..97fbe0a98 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -244,7 +244,7 @@ public class plasmaSnippetCache {
hash = (String) j.next();
pos = (Integer) hs.get(hash);
if (pos == null) {
- remaininghashes.add(hash);
+ remaininghashes.add(new String(hash));
} else {
p = pos.intValue();
if (p > maxpos) maxpos = p;
diff --git a/yacy.blue b/yacy.blue
index e69de29bb..7d64d50e8 100644
--- a/yacy.blue
+++ b/yacy.blue
@@ -0,0 +1 @@
+ebcblue