diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 1b83dc28b..159ca48e0 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -121,8 +121,9 @@ public class CacheAdmin_p {
else {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
- plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
serverFileUtils.copy(file, os);
+ os.flush();
+ plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
info += "HEADLINE:
" + scraper.getHeadline() + "
";
info += "HREF:
" + formatAnchor(document.getHyperlinks()) + "
";
info += "MEDIA:
" + formatAnchor(document.getMedialinks()) + "
";
@@ -130,7 +131,7 @@ public class CacheAdmin_p {
info += "TEXT:
" + new String(scraper.getText()) + "
";
info += "LINES:
";
String[] sentences = document.getSentences();
- for (int i = 0; i < sentences.length; i++) info += sentences + "
";
+ for (int i = 0; i < sentences.length; i++) info += sentences[i] + "
";
info += "
";
}
} catch (Exception e) {
diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
index a91936ea9..591ca5881 100644
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
@@ -1,434 +1 @@
-// htmlFilterAbstractScraper.java
-// ---------------------------
-// (C) by Michael Peter Christen; mc@anomic.de
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2004
-// last major change: 18.02.2004
-//
-// You agree that the Author(s) is (are) not responsible for cost,
-// loss of data or any harm that may be caused by usage of this softare or
-// this documentation. The usage of this software is on your own risk. The
-// installation and usage (starting/running) of this software may allow other
-// people or application to access your computer and any attached devices and
-// is highly dependent on the configuration of the software which must be
-// done by the user of the software;the author(s) is (are) also
-// not responsible for proper configuration and usage of the software, even
-// if provoked by documentation provided together with the software.
-//
-// THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION
-// IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS
-// FILE AND AS IN http://www.gnu.org/licenses/gpl.txt
-// ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE
-// LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT
-// BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION
-// MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE.
-// CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH.
-
-package de.anomic.htmlFilter;
-
-import java.util.HashSet;
-import java.util.HashMap;
-import java.util.Properties;
-
-import de.anomic.server.serverByteBuffer;
-
-public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
-
- public static final byte lb = (byte) '<';
- public static final byte rb = (byte) '>';
- public static final byte sl = (byte) '/';
-
- private HashSet tags0;
- private HashSet tags1;
-
- // define a translation table for html character codings
- private static HashMap trans = new HashMap(300);
- static {
- trans.put(""", "\""); //Anführungszeichen oben
- trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und
- trans.put("<", "<"); //öffnende spitze Klammer
- trans.put(">", ">"); //schließende spitze Klammer
- trans.put(" ", " "); //Erzwungenes Leerzeichen
- trans.put("¡", "!"); //umgekehrtes Ausrufezeichen
- trans.put("¢", " cent "); //Cent-Zeichen
- trans.put("£", " pound "); //Pfund-Zeichen
- trans.put("¤", " currency "); //Währungs-Zeichen
- trans.put("¥", " yen "); //Yen-Zeichen
- trans.put("¦", " "); //durchbrochener Strich
- trans.put("§", " paragraph "); //Paragraph-Zeichen
- trans.put("¨", " "); //Pünktchen oben
- trans.put("©", " copyright "); //Copyright-Zeichen
- trans.put("ª", " "); //Ordinal-Zeichen weiblich
- trans.put("«", " "); //angewinkelte Anführungszeichen links
- trans.put("¬", " not "); //Verneinungs-Zeichen
- trans.put("", "-"); //kurzer Trennstrich
- trans.put("®", " trademark "); //Registriermarke-Zeichen
- trans.put("¯", " "); //Überstrich
- trans.put("°", " degree "); //Grad-Zeichen
- trans.put("±", " +/- "); //Plusminus-Zeichen
- trans.put("²", " square "); //Hoch-2-Zeichen
- trans.put("³", " 3 "); //Hoch-3-Zeichen
- trans.put("´", " "); //Acute-Zeichen
- trans.put("µ", " micro "); //Mikro-Zeichen
- trans.put("¶", " paragraph "); //Absatz-Zeichen
- trans.put("·", " "); //Mittelpunkt
- trans.put("¸", " "); //Häkchen unten
- trans.put("¹", " "); //Hoch-1-Zeichen
- trans.put("º", " degree "); //Ordinal-Zeichen männlich
- trans.put("»", " "); //angewinkelte Anführungszeichen rechts
- trans.put("¼", " quarter "); //ein Viertel
- trans.put("½", " half "); //ein Halb
- trans.put("¾", " 3/4 "); //drei Viertel
- trans.put("¿", "?"); //umgekehrtes Fragezeichen
- trans.put("À", "A"); //A mit Accent grave
- trans.put("Á", "A"); //A mit Accent acute
- trans.put("Â", "A"); //A mit Circumflex
- trans.put("Ã", "A"); //A mit Tilde
- trans.put("Ä", "Ae"); //A Umlaut
- trans.put("Å", "A"); //A mit Ring
- trans.put("Æ", "A"); //A mit legiertem E
- trans.put("Ç", "C"); //C mit Häkchen
- trans.put("È", "E"); //E mit Accent grave
- trans.put("É", "E"); //E mit Accent acute
- trans.put("Ê", "E"); //E mit Circumflex
- trans.put("Ë", "E"); //E Umlaut
- trans.put("Ì", "I"); //I mit Accent grave
- trans.put("Í", "I"); //I mit Accent acute
- trans.put("Î", "I"); //I mit Circumflex
- trans.put("Ï", "I"); //I Umlaut
- trans.put("Ð", "D"); //Eth (isländisch)
- trans.put("Ñ", "N"); //N mit Tilde
- trans.put("Ò", "O"); //O mit Accent grave
- trans.put("Ó", "O"); //O mit Accent acute
- trans.put("Ô", "O"); //O mit Circumflex
- trans.put("Õ", "O"); //O mit Tilde
- trans.put("Ö", "Oe"); //O Umlaut
- trans.put("×", " times "); //Mal-Zeichen
- trans.put("Ø", "O"); //O mit Schrägstrich
- trans.put("Ù", "U"); //U mit Accent grave
- trans.put("Ú", "U"); //U mit Accent acute
- trans.put("Û", "U"); //U mit Circumflex
- trans.put("Ü", "Ue"); //U Umlaut
- trans.put("Ý", "Y"); //Y mit Accent acute
- trans.put("Þ", "P"); //THORN (isländisch)
- trans.put("ß", "ss"); //scharfes S
- trans.put("à", "a"); //a mit Accent grave
- trans.put("á", "a"); //a mit Accent acute
- trans.put("â", "a"); //a mit Circumflex
- trans.put("ã", "a"); //a mit Tilde
- trans.put("ä", "ae"); //a Umlaut
- trans.put("å", "a"); //a mit Ring
- trans.put("æ", "a"); //a mit legiertem e
- trans.put("ç", "c"); //c mit Häkchen
- trans.put("è", "e"); //e mit Accent grave
- trans.put("é", "e"); //e mit Accent acute
- trans.put("ê", "e"); //e mit Circumflex
- trans.put("ë", "e"); //e Umlaut
- trans.put("ì", "i"); //i mit Accent grave
- trans.put("í", "i"); //i mit Accent acute
- trans.put("î", "i"); //i mit Circumflex
- trans.put("ï", "i"); //i Umlaut
- trans.put("ð", "d"); //eth (isländisch)
- trans.put("ñ", "n"); //n mit Tilde
- trans.put("ò", "o"); //o mit Accent grave
- trans.put("ó", "o"); //o mit Accent acute
- trans.put("ô", "o"); //o mit Circumflex
- trans.put("õ", "o"); //o mit Tilde
- trans.put("ö", "oe"); //o Umlaut
- trans.put("÷", "%"); //Divisions-Zeichen
- trans.put("ø", "o"); //o mit Schrägstrich
- trans.put("ù", "u"); //u mit Accent grave
- trans.put("ú", "u"); //u mit Accent acute
- trans.put("û", "u"); //u mit Circumflex
- trans.put("ü", "ue"); //u Umlaut
- trans.put("ý", "y"); //y mit Accent acute
- trans.put("þ", "p"); //thorn (isländisch)
- trans.put("ÿ", "y"); //y Umlaut
- trans.put("Α", " Alpha "); //Alpha groß
- trans.put("α", " alpha "); //alpha klein
- trans.put("Β", " Beta "); //Beta groß
- trans.put("β", " beta "); //beta klein
- trans.put("Γ", " Gamma "); //Gamma groß
- trans.put("γ", " gamma "); //gamma klein
- trans.put("Δ", " Delta "); //Delta groß
- trans.put("δ", " delta "); //delta klein
- trans.put("Ε", " Epsilon "); //Epsilon groß
- trans.put("ε", " epsilon "); //epsilon klein
- trans.put("Ζ", " Zeta "); //Zeta groß
- trans.put("ζ", " zeta "); //zeta klein
- trans.put("Η", " Eta "); //Eta groß
- trans.put("η", " eta "); //eta klein
- trans.put("Θ", " Theta "); //Theta groß
- trans.put("θ", " theta "); //theta klein
- trans.put("Ι", " Iota "); //Iota groß
- trans.put("ι", " iota "); //iota klein
- trans.put("Κ", " Kappa "); //Kappa groß
- trans.put("κ", " kappa "); //kappa klein
- trans.put("Λ", " Lambda "); //Lambda groß
- trans.put("λ", " lambda "); //lambda klein
- trans.put("Μ", " Mu "); //Mu groß
- trans.put("μ", " mu "); //mu klein
- trans.put("Ν", " Nu "); //Nu groß
- trans.put("ν", " nu "); //nu klein
- trans.put("Ξ", " Xi "); //Xi groß
- trans.put("ξ", " xi "); //xi klein
- trans.put("Ο", " Omicron "); //Omicron groß
- trans.put("ο", " omicron "); //omicron klein
- trans.put("Π", " Pi "); //Pi groß
- trans.put("π", " pi "); //pi klein
- trans.put("Ρ", " Rho "); //Rho groß
- trans.put("ρ", " rho "); //rho klein
- trans.put("Σ", " Sigma "); //Sigma groß
- trans.put("ς", " sigma "); //sigmaf klein
- trans.put("σ", " sigma "); //sigma klein
- trans.put("Τ", " Tau "); //Tau groß
- trans.put("τ", " tau "); //tau klein
- trans.put("Υ", " Ypsilon "); //Upsilon groß
- trans.put("υ", " ypsilon "); //upsilon klein
- trans.put("Φ", " Phi "); //Phi groß
- trans.put("φ", " phi "); //phi klein
- trans.put("Χ", " Chi "); //Chi groß
- trans.put("χ", " chi "); //chi klein
- trans.put("Ψ", " Psi "); //Psi groß
- trans.put("ψ", " psi "); //psi klein
- trans.put("Ω", " Omega "); //Omega groß
- trans.put("ω", " omega "); //omega klein
- trans.put("ϑ", " theta "); //theta Symbol
- trans.put("ϒ", " ypsilon "); //upsilon mit Haken
- trans.put("ϖ", " pi "); //pi Symbol
- trans.put("∀", " for all "); //für alle
- trans.put("∂", " part of "); //teilweise
- trans.put("∃", " exists "); //existiert
- trans.put("∅", " null "); //leer
- trans.put("∇", " nabla "); //nabla
- trans.put("∈", " element of "); //Element von
- trans.put("∉", " not element of "); //kein Element von
- trans.put("∋", " contains "); //enthält als Element
- trans.put("∏", " product "); //Produkt
- trans.put("∑", " sum "); //Summe
- trans.put("−", " minus "); //minus
- trans.put("∗", " times "); //Asterisk
- trans.put("√", " sqare root "); //Quadratwurzel
- trans.put("∝", " proportional to "); //proportional zu
- trans.put("∞", " unlimited "); //unendlich
- trans.put("∠", " angle "); //Winkel
- trans.put("∧", " and "); //und
- trans.put("∨", " or "); //oder
- trans.put("∩", " "); //Schnittpunkt
- trans.put("∪", " unity "); //Einheit
- trans.put("∫", " integral "); //Integral
- trans.put("∴", " cause "); //deshalb
- trans.put("∼", " similar to "); //ähnlich wie
- trans.put("≅", " equal "); //annähernd gleich
- trans.put("≈", " equal "); //beinahe gleich
- trans.put("≠", " not equal "); //ungleich
- trans.put("≡", " identical "); //identisch mit
- trans.put("≤", " smaller or equal than "); //kleiner gleich
- trans.put("≥", " greater or equal than "); //größer gleich
- trans.put("⊂", " subset of "); //Untermenge von
- trans.put("⊃", " superset of "); //Obermenge von
- trans.put("⊄", " not subset of "); //keine Untermenge von
- trans.put("⊆", ""); //Untermenge von oder gleich mit
- trans.put("⊇", ""); //Obermenge von oder gleich mit
- trans.put("⊕", ""); //Direktsumme
- trans.put("⊗", ""); //Vektorprodukt
- trans.put("⊥", ""); //senkrecht zu
- trans.put("⋅", ""); //Punkt-Operator
- trans.put("◊", ""); //Raute
- trans.put("⌈", ""); //links oben
- trans.put("⌉", ""); //rechts oben
- trans.put("⌊", ""); //links unten
- trans.put("⌋", ""); //rechts unten
- trans.put("〈", ""); //spitze Klammer links
- trans.put("〉", ""); //spitze Klammer rechts
- trans.put("←", ""); //Pfeil links
- trans.put("↑", ""); //Pfeil oben
- trans.put("→", ""); //Pfeil rechts
- trans.put("↓", ""); //Pfeil unten
- trans.put("↔", ""); //Pfeil links/rechts
- trans.put("↵", ""); //Pfeil unten-Knick-links
- trans.put("⇐", ""); //Doppelpfeil links
- trans.put("⇑", ""); //Doppelpfeil oben
- trans.put("⇒", ""); //Doppelpfeil rechts
- trans.put("⇓", ""); //Doppelpfeil unten
- trans.put("⇔", ""); //Doppelpfeil links/rechts
- trans.put("•", ""); //Bullet-Zeichen
- trans.put("…", ""); //Horizontale Ellipse
- trans.put("′", ""); //Minutenzeichen
- trans.put("‾", ""); //Überstrich
- trans.put("⁄", ""); //Bruchstrich
- trans.put("℘", ""); //Weierstrass p
- trans.put("ℑ", ""); //Zeichen für "imaginär"
- trans.put("ℜ", ""); //Zeichen für "real"
- trans.put("™", ""); //Trademark-Zeichen
- trans.put("€", ""); //Euro-Zeichen
- trans.put("ℵ", ""); //Alef-Symbol
- trans.put("♠", ""); //Pik-Zeichen
- trans.put("♣", ""); //Kreuz-Zeichen
- trans.put("♥", ""); //Herz-Zeichen
- trans.put("♦", ""); //Karo-Zeichen
- trans.put(" ", ""); //Leerzeichen Breite n
- trans.put(" ", ""); //Leerzeichen Breite m
- trans.put(" ", ""); //Schmales Leerzeichen
- trans.put("", ""); //null breiter Nichtverbinder
- trans.put("", ""); //null breiter Verbinder
- trans.put("", ""); //links-nach-rechts-Zeichen
- trans.put("", ""); //rechts-nach-links-Zeichen
- trans.put("–", ""); //Gedankenstrich Breite n
- trans.put("—", ""); //Gedankenstrich Breite m
- trans.put("‘", ""); //einfaches Anführungszeichen links
- trans.put("’", ""); //einfaches Anführungszeichen rechts
- trans.put("‚", ""); //einfaches low-9-Zeichen
- trans.put("“", ""); //doppeltes Anführungszeichen links
- trans.put("”", ""); //doppeltes Anführungszeichen rechts
- trans.put("„", ""); //doppeltes low-9-Zeichen rechts
- trans.put("†", ""); //Kreuz
- trans.put("‡", ""); //Doppelkreuz
- trans.put("‰", ""); //zu tausend
- trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links
- trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts
- }
-
-
- public htmlFilterAbstractScraper(HashSet tags0, HashSet tags1) {
- this.tags0 = tags0;
- this.tags1 = tags1;
- }
-
- public boolean isTag0(String tag) {
- return (tags0 != null) && (tags0.contains(tag));
- }
-
- public boolean isTag1(String tag) {
- return (tags1 != null) && (tags1.contains(tag));
- }
-
- //the 'missing' method that shall be implemented:
- public abstract void scrapeText(byte[] text);
-
- // the other methods must take into account to construct the return value correctly
- public void scrapeTag0(String tagname, Properties tagopts) {
- }
-
- public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
- }
-
- // string conversions
- private static String code_iso8859s(int c) {
- switch ((int) c & 0xff) {
-
- // german umlaute and ligaturen
- case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE";
- case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue";
- case 0xdf: return "ss";
-
- // accent on letters; i.e. french characters
- case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A";
- case 0xc6: return "AE";
- case 0xc7: return "C";
- case 0xc8: case 0xc9: case 0xca: return "E";
- case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I";
- case 0xd0: return "D";
- case 0xd1: return "N";
- case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O";
- case 0xd7: return "x";
- case 0xd9: case 0xda: case 0xdb: return "U";
- case 0xdd: return "Y";
- case 0xde: return "p";
-
- case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a";
- case 0xe6: return "ae";
- case 0xe7: return "c";
- case 0xe8: case 0xe9: case 0xea: return "e";
- case 0xec: case 0xed: case 0xee: case 0xef: return "i";
- case 0xf0: return "d";
- case 0xf1: return "n";
- case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o";
- case 0xf7: return "%";
- case 0xf9: case 0xfa: case 0xfb: return "u";
- case 0xfd: case 0xff: return "y";
- case 0xfe: return "p";
-
- // special characters
- case 0xa4: return " euro ";
- default: return null;
- }
- }
-
- public static serverByteBuffer convertUmlaute(serverByteBuffer bb) {
- serverByteBuffer t = new serverByteBuffer(bb.length() + 20);
- int b0, b1, b2;
- String z;
- int i = 0;
- while (i < bb.length()) {
- b0 = bb.byteAt(i) & 0xff;
- // check utf-8 encoding
- if (b0 < 128) {
- t.append(b0);
- i++;
- } else {
- b1 = bb.byteAt(i + 1) & 0xff;
- if ((b0 > 0xbf) && (b0 < 0xe0)) {
- z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f));
- i += 2;
- } else {
- b2 = bb.byteAt(i + 2) & 0xff;
- z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f));
- i += 3;
- }
- if (z == null) t.append(b0); else t.append(z);
- }
- }
- return t;
- }
-
- private static byte[] transscript(byte[] code) {
- String t = (String) trans.get(new String(code));
- if (t == null) return new byte[0]; else return t.getBytes();
- }
-
- protected static serverByteBuffer transscriptAll(serverByteBuffer bb) {
- int p0 = 0, p1;
- byte[] t;
- while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) {
- p1 = bb.indexOf((byte) ';', p0);
- if (p1 >= 0) {
- t = transscript(bb.getBytes(p0, p1 + 1));
- bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1));
- } else {
- bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1));
- }
- }
- t = null;
- return bb;
- }
-
- protected static serverByteBuffer stripAllTags(serverByteBuffer bb) {
- int p0 = 0, p1;
- while ((p0 = bb.indexOf(lb, p0)) >= 0) {
- p1 = bb.indexOf(rb, p0);
- if (p1 >= 0) {
- bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim());
- } else {
- bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim());
- }
- }
- return bb.trim();
- }
-
- public static serverByteBuffer stripAll(serverByteBuffer bb) {
- //return stripAllTags(s);
- return convertUmlaute(transscriptAll(stripAllTags(bb)));
- }
-
- public void close() {
- // free resources
- tags0 = null;
- tags1 = null;
- }
-
- public void finalize() {
- close();
- }
-
-}
+// htmlFilterAbstractScraper.java
// ---------------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 18.02.2004
//
// You agree that the Author(s) is (are) not responsible for cost,
// loss of data or any harm that may be caused by usage of this softare or
// this documentation. The usage of this software is on your own risk. The
// installation and usage (starting/running) of this software may allow other
// people or application to access your computer and any attached devices and
// is highly dependent on the configuration of the software which must be
// done by the user of the software;the author(s) is (are) also
// not responsible for proper configuration and usage of the software, even
// if provoked by documentation provided together with the software.
//
// THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION
// IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS
// FILE AND AS IN http://www.gnu.org/licenses/gpl.txt
// ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE
// LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT
// BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION
// MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE.
// CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH.
package de.anomic.htmlFilter;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Properties;
import de.anomic.server.serverByteBuffer;
public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
// byte constants for HTML markup delimiters; stripAllTags searches for lb and rb
public static final byte lb = (byte) '<'; // opens a tag
public static final byte rb = (byte) '>'; // closes a tag
public static final byte sl = (byte) '/'; // slash; not referenced in the visible part of this class
// tag-name sets supplied to the constructor and queried by isTag0 / isTag1;
// a null set makes the corresponding isTagN method return false
// NOTE(review): presumably tags0 holds tags without enclosed text and tags1 tags with enclosed text
// (compare the scrapeTag0 / scrapeTag1 signatures) - confirm against htmlFilterScraper
private HashSet tags0;
private HashSet tags1;
// Translation table for HTML character entity references.
// Keys are the complete entity reference including the leading '&' and the trailing ';',
// because transscript() looks up exactly the byte span that transscriptAll() cuts out of
// the text ('&' up to and including the next ';'). Values are the plain-text replacement;
// an empty value removes the entity from the text.
private static HashMap trans = new HashMap(300);
static {
    trans.put("&quot;", "\""); // quotation mark
    trans.put("&amp;", "&"); // ampersand
    trans.put("&lt;", "<"); // less-than sign (opening angle bracket)
    trans.put("&gt;", ">"); // greater-than sign (closing angle bracket)
    trans.put("&nbsp;", " "); // non-breaking space
    trans.put("&iexcl;", "!"); // inverted exclamation mark
    trans.put("&cent;", " cent "); // cent sign
    trans.put("&pound;", " pound "); // pound sign
    trans.put("&curren;", " currency "); // currency sign
    trans.put("&yen;", " yen "); // yen sign
    trans.put("&brvbar;", " "); // broken bar
    trans.put("&sect;", " paragraph "); // section sign
    trans.put("&uml;", " "); // diaeresis
    trans.put("&copy;", " copyright "); // copyright sign
    trans.put("&ordf;", " "); // feminine ordinal indicator
    trans.put("&laquo;", " "); // left-pointing angle quotation mark
    trans.put("&not;", " not "); // not sign
    trans.put("&shy;", "-"); // soft hyphen
    trans.put("&reg;", " trademark "); // registered sign
    trans.put("&macr;", " "); // macron (overline)
    trans.put("&deg;", " degree "); // degree sign
    trans.put("&plusmn;", " +/- "); // plus-minus sign
    trans.put("&sup2;", " square "); // superscript two
    trans.put("&sup3;", " 3 "); // superscript three
    trans.put("&acute;", " "); // acute accent
    trans.put("&micro;", " micro "); // micro sign
    trans.put("&para;", " paragraph "); // pilcrow (paragraph sign)
    trans.put("&middot;", " "); // middle dot
    trans.put("&cedil;", " "); // cedilla
    trans.put("&sup1;", " "); // superscript one
    trans.put("&ordm;", " degree "); // masculine ordinal indicator
    trans.put("&raquo;", " "); // right-pointing angle quotation mark
    trans.put("&frac14;", " quarter "); // one quarter
    trans.put("&frac12;", " half "); // one half
    trans.put("&frac34;", " 3/4 "); // three quarters
    trans.put("&iquest;", "?"); // inverted question mark
    trans.put("&Agrave;", "A"); // A with grave accent
    trans.put("&Aacute;", "A"); // A with acute accent
    trans.put("&Acirc;", "A"); // A with circumflex
    trans.put("&Atilde;", "A"); // A with tilde
    trans.put("&Auml;", "Ae"); // A umlaut
    trans.put("&Aring;", "A"); // A with ring
    trans.put("&AElig;", "A"); // AE ligature
    trans.put("&Ccedil;", "C"); // C with cedilla
    trans.put("&Egrave;", "E"); // E with grave accent
    trans.put("&Eacute;", "E"); // E with acute accent
    trans.put("&Ecirc;", "E"); // E with circumflex
    trans.put("&Euml;", "E"); // E umlaut
    trans.put("&Igrave;", "I"); // I with grave accent
    trans.put("&Iacute;", "I"); // I with acute accent
    trans.put("&Icirc;", "I"); // I with circumflex
    trans.put("&Iuml;", "I"); // I umlaut
    trans.put("&ETH;", "D"); // Eth (Icelandic)
    trans.put("&Ntilde;", "N"); // N with tilde
    trans.put("&Ograve;", "O"); // O with grave accent
    trans.put("&Oacute;", "O"); // O with acute accent
    trans.put("&Ocirc;", "O"); // O with circumflex
    trans.put("&Otilde;", "O"); // O with tilde
    trans.put("&Ouml;", "Oe"); // O umlaut
    trans.put("&times;", " times "); // multiplication sign
    trans.put("&Oslash;", "O"); // O with stroke
    trans.put("&Ugrave;", "U"); // U with grave accent
    trans.put("&Uacute;", "U"); // U with acute accent
    trans.put("&Ucirc;", "U"); // U with circumflex
    trans.put("&Uuml;", "Ue"); // U umlaut
    trans.put("&Yacute;", "Y"); // Y with acute accent
    trans.put("&THORN;", "P"); // THORN (Icelandic)
    trans.put("&szlig;", "ss"); // sharp s
    trans.put("&agrave;", "a"); // a with grave accent
    trans.put("&aacute;", "a"); // a with acute accent
    trans.put("&acirc;", "a"); // a with circumflex
    trans.put("&atilde;", "a"); // a with tilde
    trans.put("&auml;", "ae"); // a umlaut
    trans.put("&aring;", "a"); // a with ring
    trans.put("&aelig;", "a"); // ae ligature
    trans.put("&ccedil;", "c"); // c with cedilla
    trans.put("&egrave;", "e"); // e with grave accent
    trans.put("&eacute;", "e"); // e with acute accent
    trans.put("&ecirc;", "e"); // e with circumflex
    trans.put("&euml;", "e"); // e umlaut
    trans.put("&igrave;", "i"); // i with grave accent
    trans.put("&iacute;", "i"); // i with acute accent
    trans.put("&icirc;", "i"); // i with circumflex
    trans.put("&iuml;", "i"); // i umlaut
    trans.put("&eth;", "d"); // eth (Icelandic)
    trans.put("&ntilde;", "n"); // n with tilde
    trans.put("&ograve;", "o"); // o with grave accent
    trans.put("&oacute;", "o"); // o with acute accent
    trans.put("&ocirc;", "o"); // o with circumflex
    trans.put("&otilde;", "o"); // o with tilde
    trans.put("&ouml;", "oe"); // o umlaut
    trans.put("&divide;", "%"); // division sign
    trans.put("&oslash;", "o"); // o with stroke
    trans.put("&ugrave;", "u"); // u with grave accent
    trans.put("&uacute;", "u"); // u with acute accent
    trans.put("&ucirc;", "u"); // u with circumflex
    trans.put("&uuml;", "ue"); // u umlaut
    trans.put("&yacute;", "y"); // y with acute accent
    trans.put("&thorn;", "p"); // thorn (Icelandic)
    trans.put("&yuml;", "y"); // y umlaut
    trans.put("&Alpha;", " Alpha "); // capital Alpha
    trans.put("&alpha;", " alpha "); // small alpha
    trans.put("&Beta;", " Beta "); // capital Beta
    trans.put("&beta;", " beta "); // small beta
    trans.put("&Gamma;", " Gamma "); // capital Gamma
    trans.put("&gamma;", " gamma "); // small gamma
    trans.put("&Delta;", " Delta "); // capital Delta
    trans.put("&delta;", " delta "); // small delta
    trans.put("&Epsilon;", " Epsilon "); // capital Epsilon
    trans.put("&epsilon;", " epsilon "); // small epsilon
    trans.put("&Zeta;", " Zeta "); // capital Zeta
    trans.put("&zeta;", " zeta "); // small zeta
    trans.put("&Eta;", " Eta "); // capital Eta
    trans.put("&eta;", " eta "); // small eta
    trans.put("&Theta;", " Theta "); // capital Theta
    trans.put("&theta;", " theta "); // small theta
    trans.put("&Iota;", " Iota "); // capital Iota
    trans.put("&iota;", " iota "); // small iota
    trans.put("&Kappa;", " Kappa "); // capital Kappa
    trans.put("&kappa;", " kappa "); // small kappa
    trans.put("&Lambda;", " Lambda "); // capital Lambda
    trans.put("&lambda;", " lambda "); // small lambda
    trans.put("&Mu;", " Mu "); // capital Mu
    trans.put("&mu;", " mu "); // small mu
    trans.put("&Nu;", " Nu "); // capital Nu
    trans.put("&nu;", " nu "); // small nu
    trans.put("&Xi;", " Xi "); // capital Xi
    trans.put("&xi;", " xi "); // small xi
    trans.put("&Omicron;", " Omicron "); // capital Omicron
    trans.put("&omicron;", " omicron "); // small omicron
    trans.put("&Pi;", " Pi "); // capital Pi
    trans.put("&pi;", " pi "); // small pi
    trans.put("&Rho;", " Rho "); // capital Rho
    trans.put("&rho;", " rho "); // small rho
    trans.put("&Sigma;", " Sigma "); // capital Sigma
    trans.put("&sigmaf;", " sigma "); // small final sigma
    trans.put("&sigma;", " sigma "); // small sigma
    trans.put("&Tau;", " Tau "); // capital Tau
    trans.put("&tau;", " tau "); // small tau
    trans.put("&Upsilon;", " Ypsilon "); // capital Upsilon
    trans.put("&upsilon;", " ypsilon "); // small upsilon
    trans.put("&Phi;", " Phi "); // capital Phi
    trans.put("&phi;", " phi "); // small phi
    trans.put("&Chi;", " Chi "); // capital Chi
    trans.put("&chi;", " chi "); // small chi
    trans.put("&Psi;", " Psi "); // capital Psi
    trans.put("&psi;", " psi "); // small psi
    trans.put("&Omega;", " Omega "); // capital Omega
    trans.put("&omega;", " omega "); // small omega
    trans.put("&thetasym;", " theta "); // theta symbol
    trans.put("&upsih;", " ypsilon "); // upsilon with hook
    trans.put("&piv;", " pi "); // pi symbol
    trans.put("&forall;", " for all "); // for all
    trans.put("&part;", " part of "); // partial differential
    trans.put("&exist;", " exists "); // there exists
    trans.put("&empty;", " null "); // empty set
    trans.put("&nabla;", " nabla "); // nabla
    trans.put("&isin;", " element of "); // element of
    trans.put("&notin;", " not element of "); // not an element of
    trans.put("&ni;", " contains "); // contains as member
    trans.put("&prod;", " product "); // product
    trans.put("&sum;", " sum "); // sum
    trans.put("&minus;", " minus "); // minus
    trans.put("&lowast;", " times "); // asterisk operator
    trans.put("&radic;", " sqare root "); // square root
    trans.put("&prop;", " proportional to "); // proportional to
    trans.put("&infin;", " unlimited "); // infinity
    trans.put("&ang;", " angle "); // angle
    trans.put("&and;", " and "); // logical and
    trans.put("&or;", " or "); // logical or
    trans.put("&cap;", " "); // intersection
    trans.put("&cup;", " unity "); // union
    trans.put("&int;", " integral "); // integral
    trans.put("&there4;", " cause "); // therefore
    trans.put("&sim;", " similar to "); // similar to
    trans.put("&cong;", " equal "); // approximately equal
    trans.put("&asymp;", " equal "); // almost equal
    trans.put("&ne;", " not equal "); // not equal
    trans.put("&equiv;", " identical "); // identical to
    trans.put("&le;", " smaller or equal than "); // less than or equal
    trans.put("&ge;", " greater or equal than "); // greater than or equal
    trans.put("&sub;", " subset of "); // subset of
    trans.put("&sup;", " superset of "); // superset of
    trans.put("&nsub;", " not subset of "); // not a subset of
    trans.put("&sube;", ""); // subset of or equal to
    trans.put("&supe;", ""); // superset of or equal to
    trans.put("&oplus;", ""); // direct sum
    trans.put("&otimes;", ""); // vector product
    trans.put("&perp;", ""); // perpendicular to
    trans.put("&sdot;", ""); // dot operator
    trans.put("&loz;", ""); // lozenge
    trans.put("&lceil;", ""); // left ceiling
    trans.put("&rceil;", ""); // right ceiling
    trans.put("&lfloor;", ""); // left floor
    trans.put("&rfloor;", ""); // right floor
    trans.put("&lang;", ""); // left angle bracket
    trans.put("&rang;", ""); // right angle bracket
    trans.put("&larr;", ""); // left arrow
    trans.put("&uarr;", ""); // up arrow
    trans.put("&rarr;", ""); // right arrow
    trans.put("&darr;", ""); // down arrow
    trans.put("&harr;", ""); // left-right arrow
    trans.put("&crarr;", ""); // down arrow with corner leftwards
    trans.put("&lArr;", ""); // left double arrow
    trans.put("&uArr;", ""); // up double arrow
    trans.put("&rArr;", ""); // right double arrow
    trans.put("&dArr;", ""); // down double arrow
    trans.put("&hArr;", ""); // left-right double arrow
    trans.put("&bull;", ""); // bullet
    trans.put("&hellip;", ""); // horizontal ellipsis
    trans.put("&prime;", ""); // prime (minutes)
    trans.put("&oline;", ""); // overline
    trans.put("&frasl;", ""); // fraction slash
    trans.put("&weierp;", ""); // Weierstrass p
    trans.put("&image;", ""); // symbol for "imaginary part"
    trans.put("&real;", ""); // symbol for "real part"
    trans.put("&trade;", ""); // trademark sign
    trans.put("&euro;", ""); // euro sign
    trans.put("&alefsym;", ""); // alef symbol
    trans.put("&spades;", ""); // spade suit
    trans.put("&clubs;", ""); // club suit
    trans.put("&hearts;", ""); // heart suit
    trans.put("&diams;", ""); // diamond suit
    trans.put("&ensp;", ""); // en space
    trans.put("&emsp;", ""); // em space
    trans.put("&thinsp;", ""); // thin space
    trans.put("&zwnj;", ""); // zero-width non-joiner
    trans.put("&zwj;", ""); // zero-width joiner
    trans.put("&lrm;", ""); // left-to-right mark
    trans.put("&rlm;", ""); // right-to-left mark
    trans.put("&ndash;", ""); // en dash
    trans.put("&mdash;", ""); // em dash
    trans.put("&lsquo;", ""); // left single quotation mark
    trans.put("&rsquo;", ""); // right single quotation mark
    trans.put("&sbquo;", ""); // single low-9 quotation mark
    trans.put("&ldquo;", ""); // left double quotation mark
    trans.put("&rdquo;", ""); // right double quotation mark
    trans.put("&bdquo;", ""); // double low-9 quotation mark
    trans.put("&dagger;", ""); // dagger
    trans.put("&Dagger;", ""); // double dagger
    trans.put("&permil;", ""); // per mille sign
    trans.put("&lsaquo;", ""); // single left-pointing angle quotation mark
    trans.put("&rsaquo;", ""); // single right-pointing angle quotation mark
}
// Stores the two tag-name sets that isTag0 and isTag1 consult.
// Either argument may be null; the matching isTagN method then always answers false.
public htmlFilterAbstractScraper(HashSet tags0, HashSet tags1) {
    this.tags1 = tags1;
    this.tags0 = tags0;
}
// Tells whether the given tag name is contained in the tags0 set.
// Answers false when no tags0 set is attached (null).
public boolean isTag0(String tag) {
    if (tags0 == null) {
        return false;
    }
    return tags0.contains(tag);
}
// Tells whether the given tag name is contained in the tags1 set.
// Answers false when no tags1 set is attached (null).
public boolean isTag1(String tag) {
    if (tags1 == null) {
        return false;
    }
    return tags1.contains(tag);
}
// The one method this abstract class leaves open: every concrete scraper must implement it.
// It receives a run of text as raw bytes.
// NOTE(review): presumably called for text found outside of the registered tags - confirm against the caller
public abstract void scrapeText(byte[] text);
// Default implementation that does nothing; subclasses override it to collect what they need.
// tagname is the name of the tag, tagopts its properties.
// NOTE(review): presumably invoked for tags registered in tags0 (no enclosed text is passed) - confirm against the caller
public void scrapeTag0(String tagname, Properties tagopts) {
}
// Default implementation that does nothing; subclasses override it to collect what they need.
// tagname is the name of the tag, tagopts its properties, text a byte array belonging to the tag.
// NOTE(review): presumably invoked for tags registered in tags1, with text being the enclosed content - confirm against the caller
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
}
// string conversions

// Transliterates one ISO-8859-1 character into plain ASCII.
// c is the character code; values 0..0xff are taken as-is and sign-extended bytes
// (-128..-1) are mapped back into that range. Returns the ASCII replacement string,
// or null when the character has no transliteration or lies outside the single-byte
// range. Larger code points must not be masked down to their low byte, otherwise
// unrelated characters (e.g. U+01E4) would be reported as umlauts.
private static String code_iso8859s(int c) {
    if ((c < Byte.MIN_VALUE) || (c > 0xff)) return null;
    switch (c & 0xff) {

        // german umlauts and ligatures
        case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE";
        case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue";
        case 0xdf: return "ss";

        // accented letters, e.g. french characters
        case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A";
        case 0xc6: return "AE";
        case 0xc7: return "C";
        // 0xcb (E with diaeresis) belongs here like the other accented E variants
        case 0xc8: case 0xc9: case 0xca: case 0xcb: return "E";
        case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I";
        case 0xd0: return "D";
        case 0xd1: return "N";
        case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O";
        case 0xd7: return "x";
        case 0xd9: case 0xda: case 0xdb: return "U";
        case 0xdd: return "Y";
        case 0xde: return "p";

        case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a";
        case 0xe6: return "ae";
        case 0xe7: return "c";
        // 0xeb (e with diaeresis) belongs here like the other accented e variants
        case 0xe8: case 0xe9: case 0xea: case 0xeb: return "e";
        case 0xec: case 0xed: case 0xee: case 0xef: return "i";
        case 0xf0: return "d";
        case 0xf1: return "n";
        case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o";
        case 0xf7: return "%";
        case 0xf9: case 0xfa: case 0xfb: return "u";
        case 0xfd: case 0xff: return "y";
        case 0xfe: return "p";

        // special characters
        case 0xa4: return " euro ";
        default: return null;
    }
}
// Replaces umlauts and accented letters in bb by ASCII transliterations and returns
// the result as a new buffer; bb itself is only read.
// The input may be UTF-8 or ISO-8859-1: a byte >= 0x80 is decoded as a UTF-8 sequence
// when it is a valid lead byte followed by the required number of continuation bytes
// (bit pattern 10xxxxxx); otherwise it is taken as a single ISO-8859-1 character.
// Characters without a transliteration are copied through with all of their bytes.
public static serverByteBuffer convertUmlaute(serverByteBuffer bb) {
    final int length = bb.length();
    serverByteBuffer t = new serverByteBuffer(length + 20);
    int b0, bk, code, consumed;
    String z;
    int i = 0;
    while (i < length) {
        b0 = bb.byteAt(i) & 0xff;
        if (b0 < 128) {
            // plain ASCII
            t.append(b0);
            i++;
            continue;
        }

        // sequence length announced by the lead byte
        if ((b0 & 0xe0) == 0xc0) consumed = 2;
        else if ((b0 & 0xf0) == 0xe0) consumed = 3;
        else if ((b0 & 0xf8) == 0xf0) consumed = 4;
        else consumed = 1;
        // a sequence that would run past the end of the buffer cannot be UTF-8
        if (i + consumed > length) consumed = 1;

        code = b0;
        if (consumed > 1) {
            // payload bits of the lead byte: 5, 4 or 3 bits for 2, 3 or 4 byte sequences
            code = b0 & (0xff >> (consumed + 1));
            for (int k = 1; k < consumed; k++) {
                bk = bb.byteAt(i + k) & 0xff;
                if ((bk & 0xc0) != 0x80) {
                    // not a continuation byte: fall back to a single ISO-8859-1 character
                    consumed = 1;
                    code = b0;
                    break;
                }
                code = (code << 6) | (bk & 0x3f);
            }
        }

        // only the single-byte range has transliterations
        z = (code <= 0xff) ? code_iso8859s(code) : null;
        if (z == null) {
            // keep the character unchanged, including its continuation bytes
            for (int k = 0; k < consumed; k++) t.append(bb.byteAt(i + k) & 0xff);
        } else {
            t.append(z);
        }
        i += consumed;
    }
    return t;
}
/**
 * Looks up the replacement for one piece of text in the trans table.
 * Bytes and strings are converted with the platform default charset.
 *
 * @param code the bytes to look up
 * @return the bytes of the replacement, or an empty array if the table has
 *         no entry for the given bytes
 */
private static byte[] transscript(byte[] code) {
    final String key = new String(code);
    final String replacement = (String) trans.get(key);
    return (replacement == null) ? new byte[0] : replacement.getBytes();
}
/**
 * Walks through the buffer and rewrites everything that starts with an
 * ampersand. As far as the serverByteBuffer calls suggest, a span reaching
 * from '&' to the next ';' is handed to transscript and replaced by its
 * result, while an '&' with no ';' behind it is removed.
 *
 * @param bb the buffer to process
 * @return the buffer holding the rewritten content; this is the argument
 *         itself if it contains no '&'
 */
protected static serverByteBuffer transscriptAll(serverByteBuffer bb) {
    int amp = bb.indexOf((byte) '&', 0);
    while (amp >= 0) {
        final int semi = bb.indexOf((byte) ';', amp);
        if (semi < 0) {
            // no terminating ';': rebuild the buffer without the '&'
            serverByteBuffer rebuilt = new serverByteBuffer(bb.getBytes(0, amp), bb.length());
            bb = rebuilt.append(bb.getBytes(amp + 1));
        } else {
            // substitute the span amp..semi by whatever transscript yields
            final byte[] replacement = transscript(bb.getBytes(amp, semi + 1));
            serverByteBuffer rebuilt = new serverByteBuffer(bb.getBytes(0, amp), bb.length() + amp - semi + replacement.length);
            bb = rebuilt.append(replacement).append(bb.getBytes(semi + 1));
        }
        // continue the search at the same offset in the rebuilt buffer
        amp = bb.indexOf((byte) '&', amp);
    }
    return bb;
}
/**
 * Removes everything between the tag delimiters lb and rb (presumably '<'
 * and '>'; both are defined elsewhere in this class) from the buffer.
 * A complete lb..rb span is replaced by a single space, and the text on
 * both sides of it is trimmed. An lb without a following rb is removed on
 * its own.
 *
 * @param bb the buffer to process
 * @return the trimmed buffer without tags
 */
protected static serverByteBuffer stripAllTags(serverByteBuffer bb) {
    int p0 = 0, p1;
    serverByteBuffer head;
    while ((p0 = bb.indexOf(lb, p0)) >= 0) {
        p1 = bb.indexOf(rb, p0);
        if (p1 >= 0) {
            head = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim();
            // Trimming may have shortened the head. The search must resume
            // where the head now ends, not at the old offset, otherwise a
            // tag that directly follows the removed one can be skipped.
            p0 = head.length();
            bb = head.append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim());
        } else {
            final int open = p0;
            head = new serverByteBuffer(bb.getBytes(0, open), bb.length()).trim();
            // same reasoning as above: resume at the end of the trimmed head
            p0 = head.length();
            bb = head.append(new serverByteBuffer(bb.getBytes(open + 1)).trim());
        }
    }
    return bb.trim();
}
/**
 * Runs the complete clean-up chain on a buffer: first stripAllTags, then
 * transscriptAll, finally convertUmlaute.
 *
 * @param bb the buffer to clean
 * @return the result of the last step of the chain
 */
public static serverByteBuffer stripAll(serverByteBuffer bb) {
    final serverByteBuffer withoutTags = stripAllTags(bb);
    final serverByteBuffer transcribed = transscriptAll(withoutTags);
    return convertUmlaute(transcribed);
}
/**
 * Releases the tag sets held by this scraper. Afterwards isTag0 and isTag1
 * answer false for every tag.
 */
public void close() {
    // let go of both references
    tags1 = null;
    tags0 = null;
}
/**
 * Finalizer hook; it only delegates to close() so that the tag sets are
 * released even when close() was never called explicitly.
 */
public void finalize() {
close();
}
}
\ No newline at end of file
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index a4751847d..3fb292c10 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -41,6 +41,7 @@
package de.anomic.htmlFilter;
import java.net.URL;
+import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
@@ -178,4 +179,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
System.out.println("TEXT :" + new String(text.getBytes()));
}
+
+ public static void main(String[] args) {
+ String test = "Nokia kürzt bei Forschung und Entwicklung";
+ try {
+ htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost"));
+ scraper.scrapeText(test.getBytes());
+ System.out.println(new String(scraper.getText()));
+ } catch (MalformedURLException e) {}
+ }
+
}
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index ab2c8811c..2ad248f2a 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -149,8 +149,8 @@ public final class plasmaParser {
* @see #initMediaExt(String)
*/
static {
- initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
- "sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj"));
+ initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
+ "sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj"));
/* ===================================================
* initializing the parser object pool
@@ -383,21 +383,21 @@ public final class plasmaParser {
private static void loadEnabledParserList() {
// loading a list of availabe parser from file
- Properties prop = new Properties();
+ Properties prop = new Properties();
BufferedInputStream bufferedIn = null;
- try {
- prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("yacy.parser"))));
- } catch (IOException e) {
- System.err.println("ERROR: yacy.parser not found in settings path");
- } finally {
+ try {
+ prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("yacy.parser"))));
+ } catch (IOException e) {
+ System.err.println("ERROR: yacy.parser not found in settings path");
+ } finally {
if (bufferedIn != null) try{ bufferedIn.close(); }catch(Exception e){}
}
-
+
// enable them ...
setEnabledParserList(prop.keySet());
- }
-
- private static void loadAvailableParserList() {
+ }
+
+ private static void loadAvailableParserList() {
try {
plasmaParser.availableParserList.clear();
@@ -405,24 +405,24 @@ public final class plasmaParser {
String javaClassPath = System.getProperty("java.class.path");
// getting the current package name
- String plasmaParserPkgName = plasmaParser.class.getPackage().getName() + ".parser";
+ String plasmaParserPkgName = plasmaParser.class.getPackage().getName() + ".parser";
serverLog.logInfo("PARSER","Searching for additional content parsers in package " + plasmaParserPkgName);
-
+
// getting an uri to the parser subpackage
- String packageURI = plasmaParser.class.getResource("/"+plasmaParserPkgName.replace('.','/')).toString();
- serverLog.logDebug("PARSER", "Parser directory is " + packageURI);
-
+ String packageURI = plasmaParser.class.getResource("/"+plasmaParserPkgName.replace('.','/')).toString();
+ serverLog.logDebug("PARSER", "Parser directory is " + packageURI);
+
// open the parser directory
- File parserDir = new File(new URI(packageURI));
+ File parserDir = new File(new URI(packageURI));
if ((parserDir == null) || (!parserDir.exists()) || (!parserDir.isDirectory())) return;
- /*
- * loop through all subdirectories and test if we can
+ /*
+ * loop through all subdirectories and test if we can
* find an additional parser class
*/
File[] parserDirectories = parserDir.listFiles(parserDirectoryFilter);
if (parserDirectories == null) return;
- for (int parserDirNr=0; parserDirNr< parserDirectories.length; parserDirNr++) {
+ for (int parserDirNr=0; parserDirNr< parserDirectories.length; parserDirNr++) {
File currentDir = parserDirectories[parserDirNr];
serverLog.logDebug("PARSER", "Searching in directory " + currentDir.toString());
String[] parserClasses = currentDir.list(parserFileNameFilter);
@@ -432,7 +432,7 @@ public final class plasmaParser {
serverLog.logDebug("PARSER", "Testing parser class " + parserClasses[parserNr]);
String className = parserClasses[parserNr].substring(0,parserClasses[parserNr].indexOf(".class"));
String fullClassName = plasmaParserPkgName + "." + currentDir.getName() + "." + className;
- try {
+ try {
// trying to load the parser class by its name
Class parserClass = Class.forName(fullClassName);
Object theParser = parserClass.newInstance();
@@ -446,7 +446,7 @@ public final class plasmaParser {
throw new ParserException("Missing dependency detected: '" + neededLibx[libxId] + "'.");
}
}
- }
+ }
// loading the list of mime-types that are supported by this parser class
Hashtable supportedMimeTypes = ((Parser)theParser).getSupportedMimeTypes();
@@ -456,31 +456,31 @@ public final class plasmaParser {
availableParserList.put(mimeType,fullClassName);
serverLog.logInfo("PARSER", "Found functional parser for mimeType '" + mimeType + "'.");
}
-
- } catch (Exception e) { /* we can ignore this for the moment */
+
+ } catch (Exception e) { /* we can ignore this for the moment */
serverLog.logWarning("PARSER", "Parser '" + className + "' doesn't work correctly and will be ignored.\n [" + e.getClass().getName() + "]: " + e.getMessage());
- } catch (Error e) { /* we can ignore this for the moment */
+ } catch (Error e) { /* we can ignore this for the moment */
serverLog.logWarning("PARSER", "Parser '" + className + "' doesn't work correctly and will be ignored.\n [" + e.getClass().getName() + "]: " + e.getMessage());
}
}
- }
+ }
} catch (Exception e) {
serverLog.logError("PARSER", "Unable to determine all installed parsers. " + e.getMessage());
- }
- }
-
- public void close() {
+ }
+ }
+
+ public void close() {
// clearing the parser list
synchronized (this.enabledParserList) {
- this.enabledParserList.clear();
- }
+ this.enabledParserList.clear();
+ }
// closing the parser object pool
- try {
- this.theParserPool.close();
- } catch (Exception e) { }
- }
+ try {
+ this.theParserPool.close();
+ } catch (Exception e) { }
+ }
public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) {
@@ -498,7 +498,6 @@ public final class plasmaParser {
// ... otherwise we make a html scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
-
hfos.write(source);
hfos.close();
return transformScraper(location, mimeType, scraper);
@@ -660,22 +659,24 @@ public final class plasmaParser {
return v;
}
- public static void main(String[] args) {
- //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
- //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
- try {
- File in = new File(args[0]);
- File out = new File(args[1]);
- plasmaParser theParser = new plasmaParser();
+ public static void main(String[] args) {
+ //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
+ //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
+ try {
+ File in = new File(args[0]);
+ //File out = new File(args[1]);
+ plasmaParser theParser = new plasmaParser();
theParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
theParser.initParseableMimeTypes("application/atom+xml,application/gzip,application/java-archive,application/msword,application/octet-stream,application/pdf,application/rdf+xml,application/rss+xml,application/rtf,application/x-gzip,application/x-tar,application/xml,application/zip,text/rss,text/rtf,text/xml,application/x-bzip2");
- FileInputStream theInput = new FileInputStream(in);
- ByteArrayOutputStream theOutput = new ByteArrayOutputStream();
- serverFileUtils.copy(theInput, theOutput);
- plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray());
- //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
- byte[] theText = document.getText();
- serverFileUtils.write(theText, out);
+ FileInputStream theInput = new FileInputStream(in);
+ ByteArrayOutputStream theOutput = new ByteArrayOutputStream();
+ serverFileUtils.copy(theInput, theOutput);
+ plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray());
+ //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
+ //byte[] theText = document.getText();
+ //serverFileUtils.write(theText, out);
+ String[] sentences = document.getSentences();
+ for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
} catch (Exception e) {
e.printStackTrace();
}
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index a51785167..d3bdbfdf3 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -105,6 +105,7 @@ public class plasmaSnippetCache {
}
public result retrieve(URL url, Set queryhashes, boolean fetchOnline) {
+ // heise = "0OQUNU3JSs05"
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url);
return new result(null, SOURCE_ERROR, "no query hashes given");
@@ -250,7 +251,7 @@ public class plasmaSnippetCache {
} catch (IOException e) {}
if (header == null) {
- String filename = url.getFile();
+ String filename = cacheManager.getCachePath(url).getName();
int p = filename.lastIndexOf('.');
if ((p < 0) ||
((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 537fdc58c..1c4ec2b46 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -371,6 +371,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38",
new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000);
}
+
+ // test routine for snippet fetch
+ // url = /www.heise.de/mobil/newsticker/meldung/mail/54980
+ Set query = new HashSet(); query.add("0OQUNU3JSs05"); // 'heise'
+ //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true);
+ plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true);
}
private static String ppRamString(int bytes) {
diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java
index 158fd147a..cd19d0d72 100644
--- a/source/de/anomic/server/serverFileUtils.java
+++ b/source/de/anomic/server/serverFileUtils.java
@@ -90,12 +90,12 @@ public final class serverFileUtils {
FileInputStream fis = null;
FileOutputStream fos = null;
try {
- fis = new FileInputStream(source);
- fos = new FileOutputStream(dest);
- copy(fis, fos);
+ fis = new FileInputStream(source);
+ fos = new FileOutputStream(dest);
+ copy(fis, fos);
} finally {
if (fis != null) try {fis.close();} catch (Exception e) {}
- if (fos != null) try {fos.close();} catch (Exception e) {}
+ if (fos != null) try {fos.close();} catch (Exception e) {}
}
}
@@ -107,16 +107,16 @@ public final class serverFileUtils {
}
public static byte[] read(File source) throws IOException {
- byte[] buffer = new byte[(int) source.length()];
- InputStream fis = null;
- try {
- fis = new FileInputStream(source);
- int p = 0, c;
- while ((c = fis.read(buffer, p, buffer.length - p)) > 0) p += c;
- } finally {
+ byte[] buffer = new byte[(int) source.length()];
+ InputStream fis = null;
+ try {
+ fis = new FileInputStream(source);
+ int p = 0, c;
+ while ((c = fis.read(buffer, p, buffer.length - p)) > 0) p += c;
+ } finally {
if (fis != null) try { fis.close(); } catch (Exception e) {}
- }
- return buffer;
+ }
+ return buffer;
}
public static byte[] readAndZip(File source) throws IOException {
diff --git a/source/yacy.java b/source/yacy.java
index 2af5914c4..69cdaed4b 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -148,6 +148,10 @@ public final class yacy {
plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
+ // hardcoded, forced, temporary value-migration
+ sb.setConfig("htTemplatePath", "htroot/env/templates");
+ sb.setConfig("parseableExt", "html,htm,txt,php,shtml,asp");
+
// if we are running an SVN version, we try to detect the used svn revision now ...
if (vString.equals("@" + "REPL_VERSION" + "@")) {
Properties buildProp = new Properties();
@@ -188,9 +192,6 @@ public final class yacy {
if (timeout < 60000) timeout = 60000;
int maxSessions = Integer.parseInt(sb.getConfig("httpdMaxSessions", "100"));
- // hardcoded, forced, temporary value-migration
- sb.setConfig("htTemplatePath", "htroot/env/templates");
-
// create some directories
File htRootPath = new File(sb.getRootPath(), sb.getConfig("htRootPath", "htroot"));
File htDocsPath = new File(sb.getRootPath(), sb.getConfig("htDocsPath", "DATA/HTDOCS"));
diff --git a/yacy.init b/yacy.init
index 74b8dbb76..c0b620b21 100644
--- a/yacy.init
+++ b/yacy.init
@@ -100,7 +100,7 @@ parseableMimeTypes=
# this is important to recognize - tags as not-html reference
# These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
mediaExt=ace,arj,asf,avi,bin,bz2,css,deb,doc,dmg,gif,gz,hqx,img,iso,jar,jpe,jpg,jpeg,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,sit,swf,sxc,sxd,sxi,sxw,tar,tgz,torrent,wmv,xcf,xls,zip
-parseableExt=html,htm,txt
+parseableExt=html,htm,txt,php,shtml,asp
# Promotion Strings
# These strings appear in the Web Mask of the YACY search client