diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 1b83dc28b..159ca48e0 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -121,8 +121,9 @@ public class CacheAdmin_p { else { htmlFilterContentScraper scraper = new htmlFilterContentScraper(url); OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); - plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper); serverFileUtils.copy(file, os); + os.flush(); + plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper); info += "HEADLINE:
" + scraper.getHeadline() + "

"; info += "HREF:
" + formatAnchor(document.getHyperlinks()) + "
"; info += "MEDIA:
" + formatAnchor(document.getMedialinks()) + "
"; @@ -130,7 +131,7 @@ public class CacheAdmin_p { info += "TEXT:
" + new String(scraper.getText()) + "
"; info += "LINES:
"; String[] sentences = document.getSentences(); - for (int i = 0; i < sentences.length; i++) info += sentences + "
"; + for (int i = 0; i < sentences.length; i++) info += sentences[i] + "
"; info += "

"; } } catch (Exception e) { diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index a91936ea9..591ca5881 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -1,434 +1 @@ -// htmlFilterAbstractScraper.java -// --------------------------- -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// last major change: 18.02.2004 -// -// You agree that the Author(s) is (are) not responsible for cost, -// loss of data or any harm that may be caused by usage of this softare or -// this documentation. The usage of this software is on your own risk. The -// installation and usage (starting/running) of this software may allow other -// people or application to access your computer and any attached devices and -// is highly dependent on the configuration of the software which must be -// done by the user of the software;the author(s) is (are) also -// not responsible for proper configuration and usage of the software, even -// if provoked by documentation provided together with the software. -// -// THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION -// IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS -// FILE AND AS IN http://www.gnu.org/licenses/gpl.txt -// ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE -// LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT -// BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION -// MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. -// CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. - -package de.anomic.htmlFilter; - -import java.util.HashSet; -import java.util.HashMap; -import java.util.Properties; - -import de.anomic.server.serverByteBuffer; - -public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { - - public static final byte lb = (byte) '<'; - public static final byte rb = (byte) '>'; - public static final byte sl = (byte) '/'; - - private HashSet tags0; - private HashSet tags1; - - // define a translation table for html character codings - private static HashMap trans = new HashMap(300); - static { - trans.put(""", "\""); //Anführungszeichen oben - trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und - trans.put("<", "<"); //öffnende spitze Klammer - trans.put(">", ">"); //schließende spitze Klammer - trans.put(" ", " "); //Erzwungenes Leerzeichen - trans.put("¡", "!"); //umgekehrtes Ausrufezeichen - trans.put("¢", " cent "); //Cent-Zeichen - trans.put("£", " pound "); //Pfund-Zeichen - trans.put("¤", " currency "); //Währungs-Zeichen - trans.put("¥", " yen "); //Yen-Zeichen - trans.put("¦", " "); //durchbrochener Strich - trans.put("§", " paragraph "); //Paragraph-Zeichen - trans.put("¨", " "); //Pünktchen oben - trans.put("©", " copyright "); //Copyright-Zeichen - trans.put("ª", " "); //Ordinal-Zeichen weiblich - trans.put("«", " "); //angewinkelte Anführungszeichen links - trans.put("¬", " not "); //Verneinungs-Zeichen - trans.put("­", "-"); //kurzer Trennstrich - trans.put("®", " trademark "); //Registriermarke-Zeichen - trans.put("¯", " "); //Überstrich - trans.put("°", " degree "); //Grad-Zeichen - trans.put("±", " +/- "); //Plusminus-Zeichen - trans.put("²", " square "); //Hoch-2-Zeichen - trans.put("³", " 3 "); //Hoch-3-Zeichen - trans.put("´", " "); //Acute-Zeichen - trans.put("µ", " micro "); //Mikro-Zeichen - trans.put("¶", " paragraph "); //Absatz-Zeichen - trans.put("·", " "); //Mittelpunkt - trans.put("¸", " "); //Häkchen unten - trans.put("¹", " "); //Hoch-1-Zeichen - trans.put("º", " degree "); //Ordinal-Zeichen männlich - trans.put("»", " "); //angewinkelte Anführungszeichen rechts - trans.put("¼", " quarter "); //ein Viertel - trans.put("½", " half "); //ein Halb - trans.put("¾", " 3/4 "); //drei Viertel - trans.put("¿", "?"); //umgekehrtes Fragezeichen - trans.put("À", "A"); //A mit Accent grave - trans.put("Á", "A"); //A mit Accent acute - trans.put("Â", "A"); //A mit Circumflex - trans.put("Ã", "A"); //A mit Tilde - trans.put("Ä", "Ae"); //A Umlaut - trans.put("Å", "A"); //A mit Ring - trans.put("Æ", "A"); //A mit legiertem E - trans.put("Ç", "C"); //C mit Häkchen - trans.put("È", "E"); //E mit Accent grave - trans.put("É", "E"); //E mit Accent acute - trans.put("Ê", "E"); //E mit Circumflex - trans.put("Ë", "E"); //E Umlaut - trans.put("Ì", "I"); //I mit Accent grave - trans.put("Í", "I"); //I mit Accent acute - trans.put("Î", "I"); //I mit Circumflex - trans.put("Ï", "I"); //I Umlaut - trans.put("Ð", "D"); //Eth (isländisch) - trans.put("Ñ", "N"); //N mit Tilde - trans.put("Ò", "O"); //O mit Accent grave - trans.put("Ó", "O"); //O mit Accent acute - trans.put("Ô", "O"); //O mit Circumflex - trans.put("Õ", "O"); //O mit Tilde - trans.put("Ö", "Oe"); //O Umlaut - trans.put("×", " times "); //Mal-Zeichen - trans.put("Ø", "O"); //O mit Schrägstrich - trans.put("Ù", "U"); //U mit Accent grave - trans.put("Ú", "U"); //U mit Accent acute - trans.put("Û", "U"); //U mit Circumflex - trans.put("Ü", "Ue"); //U Umlaut - trans.put("Ý", "Y"); //Y mit Accent acute - trans.put("Þ", "P"); //THORN (isländisch) - trans.put("ß", "ss"); //scharfes S - trans.put("à", "a"); //a mit Accent grave - trans.put("á", "a"); //a mit Accent acute - trans.put("â", "a"); //a mit Circumflex - trans.put("ã", "a"); //a mit Tilde - trans.put("ä", "ae"); //a Umlaut - trans.put("å", "a"); //a mit Ring - trans.put("æ", "a"); //a mit legiertem e - trans.put("ç", "c"); //c mit Häkchen - trans.put("è", "e"); //e mit Accent grave - trans.put("é", "e"); //e mit Accent acute - trans.put("ê", "e"); //e mit Circumflex - trans.put("ë", "e"); //e Umlaut - trans.put("ì", "i"); //i mit Accent grave - trans.put("í", "i"); //i mit Accent acute - trans.put("î", "i"); //i mit Circumflex - trans.put("ï", "i"); //i Umlaut - trans.put("ð", "d"); //eth (isländisch) - trans.put("ñ", "n"); //n mit Tilde - trans.put("ò", "o"); //o mit Accent grave - trans.put("ó", "o"); //o mit Accent acute - trans.put("ô", "o"); //o mit Circumflex - trans.put("õ", "o"); //o mit Tilde - trans.put("ö", "oe"); //o Umlaut - trans.put("÷", "%"); //Divisions-Zeichen - trans.put("ø", "o"); //o mit Schrägstrich - trans.put("ù", "u"); //u mit Accent grave - trans.put("ú", "u"); //u mit Accent acute - trans.put("û", "u"); //u mit Circumflex - trans.put("ü", "ue"); //u Umlaut - trans.put("ý", "y"); //y mit Accent acute - trans.put("þ", "p"); //thorn (isländisch) - trans.put("ÿ", "y"); //y Umlaut - trans.put("Α", " Alpha "); //Alpha groß - trans.put("α", " alpha "); //alpha klein - trans.put("Β", " Beta "); //Beta groß - trans.put("β", " beta "); //beta klein - trans.put("Γ", " Gamma "); //Gamma groß - trans.put("γ", " gamma "); //gamma klein - trans.put("Δ", " Delta "); //Delta groß - trans.put("δ", " delta "); //delta klein - trans.put("Ε", " Epsilon "); //Epsilon groß - trans.put("ε", " epsilon "); //epsilon klein - trans.put("Ζ", " Zeta "); //Zeta groß - trans.put("ζ", " zeta "); //zeta klein - trans.put("Η", " Eta "); //Eta groß - trans.put("η", " eta "); //eta klein - trans.put("Θ", " Theta "); //Theta groß - trans.put("θ", " theta "); //theta klein - trans.put("Ι", " Iota "); //Iota groß - trans.put("ι", " iota "); //iota klein - trans.put("Κ", " Kappa "); //Kappa groß - trans.put("κ", " kappa "); //kappa klein - trans.put("Λ", " Lambda "); //Lambda groß - trans.put("λ", " lambda "); //lambda klein - trans.put("Μ", " Mu "); //Mu groß - trans.put("μ", " mu "); //mu klein - trans.put("Ν", " Nu "); //Nu groß - trans.put("ν", " nu "); //nu klein - trans.put("Ξ", " Xi "); //Xi groß - trans.put("ξ", " xi "); //xi klein - trans.put("Ο", " Omicron "); //Omicron groß - trans.put("ο", " omicron "); //omicron klein - trans.put("Π", " Pi "); //Pi groß - trans.put("π", " pi "); //pi klein - trans.put("Ρ", " Rho "); //Rho groß - trans.put("ρ", " rho "); //rho klein - trans.put("Σ", " Sigma "); //Sigma groß - trans.put("ς", " sigma "); //sigmaf klein - trans.put("σ", " sigma "); //sigma klein - trans.put("Τ", " Tau "); //Tau groß - trans.put("τ", " tau "); //tau klein - trans.put("Υ", " Ypsilon "); //Upsilon groß - trans.put("υ", " ypsilon "); //upsilon klein - trans.put("Φ", " Phi "); //Phi groß - trans.put("φ", " phi "); //phi klein - trans.put("Χ", " Chi "); //Chi groß - trans.put("χ", " chi "); //chi klein - trans.put("Ψ", " Psi "); //Psi groß - trans.put("ψ", " psi "); //psi klein - trans.put("Ω", " Omega "); //Omega groß - trans.put("ω", " omega "); //omega klein - trans.put("ϑ", " theta "); //theta Symbol - trans.put("ϒ", " ypsilon "); //upsilon mit Haken - trans.put("ϖ", " pi "); //pi Symbol - trans.put("∀", " for all "); //für alle - trans.put("∂", " part of "); //teilweise - trans.put("∃", " exists "); //existiert - trans.put("∅", " null "); //leer - trans.put("∇", " nabla "); //nabla - trans.put("∈", " element of "); //Element von - trans.put("∉", " not element of "); //kein Element von - trans.put("∋", " contains "); //enthält als Element - trans.put("∏", " product "); //Produkt - trans.put("∑", " sum "); //Summe - trans.put("−", " minus "); //minus - trans.put("∗", " times "); //Asterisk - trans.put("√", " sqare root "); //Quadratwurzel - trans.put("∝", " proportional to "); //proportional zu - trans.put("∞", " unlimited "); //unendlich - trans.put("∠", " angle "); //Winkel - trans.put("∧", " and "); //und - trans.put("∨", " or "); //oder - trans.put("∩", " "); //Schnittpunkt - trans.put("∪", " unity "); //Einheit - trans.put("∫", " integral "); //Integral - trans.put("∴", " cause "); //deshalb - trans.put("∼", " similar to "); //ähnlich wie - trans.put("≅", " equal "); //annähernd gleich - trans.put("≈", " equal "); //beinahe gleich - trans.put("≠", " not equal "); //ungleich - trans.put("≡", " identical "); //identisch mit - trans.put("≤", " smaller or equal than "); //kleiner gleich - trans.put("≥", " greater or equal than "); //größer gleich - trans.put("⊂", " subset of "); //Untermenge von - trans.put("⊃", " superset of "); //Obermenge von - trans.put("⊄", " not subset of "); //keine Untermenge von - trans.put("⊆", ""); //Untermenge von oder gleich mit - trans.put("⊇", ""); //Obermenge von oder gleich mit - trans.put("⊕", ""); //Direktsumme - trans.put("⊗", ""); //Vektorprodukt - trans.put("⊥", ""); //senkrecht zu - trans.put("⋅", ""); //Punkt-Operator - trans.put("◊", ""); //Raute - trans.put("⌈", ""); //links oben - trans.put("⌉", ""); //rechts oben - trans.put("⌊", ""); //links unten - trans.put("⌋", ""); //rechts unten - trans.put("⟨", ""); //spitze Klammer links - trans.put("⟩", ""); //spitze Klammer rechts - trans.put("←", ""); //Pfeil links - trans.put("↑", ""); //Pfeil oben - trans.put("→", ""); //Pfeil rechts - trans.put("↓", ""); //Pfeil unten - trans.put("↔", ""); //Pfeil links/rechts - trans.put("↵", ""); //Pfeil unten-Knick-links - trans.put("⇐", ""); //Doppelpfeil links - trans.put("⇑", ""); //Doppelpfeil oben - trans.put("⇒", ""); //Doppelpfeil rechts - trans.put("⇓", ""); //Doppelpfeil unten - trans.put("⇔", ""); //Doppelpfeil links/rechts - trans.put("•", ""); //Bullet-Zeichen - trans.put("…", ""); //Horizontale Ellipse - trans.put("′", ""); //Minutenzeichen - trans.put("‾", ""); //Überstrich - trans.put("⁄", ""); //Bruchstrich - trans.put("℘", ""); //Weierstrass p - trans.put("ℑ", ""); //Zeichen für "imaginär" - trans.put("ℜ", ""); //Zeichen für "real" - trans.put("™", ""); //Trademark-Zeichen - trans.put("€", ""); //Euro-Zeichen - trans.put("ℵ", ""); //Alef-Symbol - trans.put("♠", ""); //Pik-Zeichen - trans.put("♣", ""); //Kreuz-Zeichen - trans.put("♥", ""); //Herz-Zeichen - trans.put("♦", ""); //Karo-Zeichen - trans.put(" ", ""); //Leerzeichen Breite n - trans.put(" ", ""); //Leerzeichen Breite m - trans.put(" ", ""); //Schmales Leerzeichen - trans.put("‌", ""); //null breiter Nichtverbinder - trans.put("‍", ""); //null breiter Verbinder - trans.put("‎", ""); //links-nach-rechts-Zeichen - trans.put("‏", ""); //rechts-nach-links-Zeichen - trans.put("–", ""); //Gedankenstrich Breite n - trans.put("—", ""); //Gedankenstrich Breite m - trans.put("‘", ""); //einfaches Anführungszeichen links - trans.put("’", ""); //einfaches Anführungszeichen rechts - trans.put("‚", ""); //einfaches low-9-Zeichen - trans.put("“", ""); //doppeltes Anführungszeichen links - trans.put("”", ""); //doppeltes Anführungszeichen rechts - trans.put("„", ""); //doppeltes low-9-Zeichen rechts - trans.put("†", ""); //Kreuz - trans.put("‡", ""); //Doppelkreuz - trans.put("‰", ""); //zu tausend - trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links - trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts - } - - - public htmlFilterAbstractScraper(HashSet tags0, HashSet tags1) { - this.tags0 = tags0; - this.tags1 = tags1; - } - - public boolean isTag0(String tag) { - return (tags0 != null) && (tags0.contains(tag)); - } - - public boolean isTag1(String tag) { - return (tags1 != null) && (tags1.contains(tag)); - } - - //the 'missing' method that shall be implemented: - public abstract void scrapeText(byte[] text); - - // the other methods must take into account to construct the return value correctly - public void scrapeTag0(String tagname, Properties tagopts) { - } - - public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { - } - - // string conversions - private static String code_iso8859s(int c) { - switch ((int) c & 0xff) { - - // german umlaute and ligaturen - case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE"; - case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue"; - case 0xdf: return "ss"; - - // accent on letters; i.e. french characters - case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A"; - case 0xc6: return "AE"; - case 0xc7: return "C"; - case 0xc8: case 0xc9: case 0xca: return "E"; - case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I"; - case 0xd0: return "D"; - case 0xd1: return "N"; - case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O"; - case 0xd7: return "x"; - case 0xd9: case 0xda: case 0xdb: return "U"; - case 0xdd: return "Y"; - case 0xde: return "p"; - - case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a"; - case 0xe6: return "ae"; - case 0xe7: return "c"; - case 0xe8: case 0xe9: case 0xea: return "e"; - case 0xec: case 0xed: case 0xee: case 0xef: return "i"; - case 0xf0: return "d"; - case 0xf1: return "n"; - case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o"; - case 0xf7: return "%"; - case 0xf9: case 0xfa: case 0xfb: return "u"; - case 0xfd: case 0xff: return "y"; - case 0xfe: return "p"; - - // special characters - case 0xa4: return " euro "; - default: return null; - } - } - - public static serverByteBuffer convertUmlaute(serverByteBuffer bb) { - serverByteBuffer t = new serverByteBuffer(bb.length() + 20); - int b0, b1, b2; - String z; - int i = 0; - while (i < bb.length()) { - b0 = bb.byteAt(i) & 0xff; - // check utf-8 encoding - if (b0 < 128) { - t.append(b0); - i++; - } else { - b1 = bb.byteAt(i + 1) & 0xff; - if ((b0 > 0xbf) && (b0 < 0xe0)) { - z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f)); - i += 2; - } else { - b2 = bb.byteAt(i + 2) & 0xff; - z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f)); - i += 3; - } - if (z == null) t.append(b0); else t.append(z); - } - } - return t; - } - - private static byte[] transscript(byte[] code) { - String t = (String) trans.get(new String(code)); - if (t == null) return new byte[0]; else return t.getBytes(); - } - - protected static serverByteBuffer transscriptAll(serverByteBuffer bb) { - int p0 = 0, p1; - byte[] t; - while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) { - p1 = bb.indexOf((byte) ';', p0); - if (p1 >= 0) { - t = transscript(bb.getBytes(p0, p1 + 1)); - bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1)); - } else { - bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1)); - } - } - t = null; - return bb; - } - - protected static serverByteBuffer stripAllTags(serverByteBuffer bb) { - int p0 = 0, p1; - while ((p0 = bb.indexOf(lb, p0)) >= 0) { - p1 = bb.indexOf(rb, p0); - if (p1 >= 0) { - bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim()); - } else { - bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim()); - } - } - return bb.trim(); - } - - public static serverByteBuffer stripAll(serverByteBuffer bb) { - //return stripAllTags(s); - return convertUmlaute(transscriptAll(stripAllTags(bb))); - } - - public void close() { - // free resources - tags0 = null; - tags1 = null; - } - - public void finalize() { - close(); - } - -} +// htmlFilterAbstractScraper.java // --------------------------- // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // last major change: 18.02.2004 // // You agree that the Author(s) is (are) not responsible for cost, // loss of data or any harm that may be caused by usage of this softare or // this documentation. The usage of this software is on your own risk. The // installation and usage (starting/running) of this software may allow other // people or application to access your computer and any attached devices and // is highly dependent on the configuration of the software which must be // done by the user of the software;the author(s) is (are) also // not responsible for proper configuration and usage of the software, even // if provoked by documentation provided together with the software. // // THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION // IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS // FILE AND AS IN http://www.gnu.org/licenses/gpl.txt // ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE // LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT // BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION // MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. // CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. package de.anomic.htmlFilter; import java.util.HashSet; import java.util.HashMap; import java.util.Properties; import de.anomic.server.serverByteBuffer; public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public static final byte lb = (byte) '<'; public static final byte rb = (byte) '>'; public static final byte sl = (byte) '/'; private HashSet tags0; private HashSet tags1; // define a translation table for html character codings private static HashMap trans = new HashMap(300); static { trans.put(""", "\""); //Anführungszeichen oben trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und trans.put("<", "<"); //öffnende spitze Klammer trans.put(">", ">"); //schließende spitze Klammer trans.put(" ", " "); //Erzwungenes Leerzeichen trans.put("¡", "!"); //umgekehrtes Ausrufezeichen trans.put("¢", " cent "); //Cent-Zeichen trans.put("£", " pound "); //Pfund-Zeichen trans.put("¤", " currency "); //Währungs-Zeichen trans.put("¥", " yen "); //Yen-Zeichen trans.put("¦", " "); //durchbrochener Strich trans.put("§", " paragraph "); //Paragraph-Zeichen trans.put("¨", " "); //Pünktchen oben trans.put("©", " copyright "); //Copyright-Zeichen trans.put("ª", " "); //Ordinal-Zeichen weiblich trans.put("«", " "); //angewinkelte Anführungszeichen links trans.put("¬", " not "); //Verneinungs-Zeichen trans.put("­", "-"); //kurzer Trennstrich trans.put("®", " trademark "); //Registriermarke-Zeichen trans.put("¯", " "); //Überstrich trans.put("°", " degree "); //Grad-Zeichen trans.put("±", " +/- "); //Plusminus-Zeichen trans.put("²", " square "); //Hoch-2-Zeichen trans.put("³", " 3 "); //Hoch-3-Zeichen trans.put("´", " "); //Acute-Zeichen trans.put("µ", " micro "); //Mikro-Zeichen trans.put("¶", " paragraph "); //Absatz-Zeichen trans.put("·", " "); //Mittelpunkt trans.put("¸", " "); //Häkchen unten trans.put("¹", " "); //Hoch-1-Zeichen trans.put("º", " degree "); //Ordinal-Zeichen männlich trans.put("»", " "); //angewinkelte Anführungszeichen rechts trans.put("¼", " quarter "); //ein Viertel trans.put("½", " half "); //ein Halb trans.put("¾", " 3/4 "); //drei Viertel trans.put("¿", "?"); //umgekehrtes Fragezeichen trans.put("À", "A"); //A mit Accent grave trans.put("Á", "A"); //A mit Accent acute trans.put("Â", "A"); //A mit Circumflex trans.put("Ã", "A"); //A mit Tilde trans.put("Ä", "Ae"); //A Umlaut trans.put("Å", "A"); //A mit Ring trans.put("Æ", "A"); //A mit legiertem E trans.put("Ç", "C"); //C mit Häkchen trans.put("È", "E"); //E mit Accent grave trans.put("É", "E"); //E mit Accent acute trans.put("Ê", "E"); //E mit Circumflex trans.put("Ë", "E"); //E Umlaut trans.put("Ì", "I"); //I mit Accent grave trans.put("Í", "I"); //I mit Accent acute trans.put("Î", "I"); //I mit Circumflex trans.put("Ï", "I"); //I Umlaut trans.put("Ð", "D"); //Eth (isländisch) trans.put("Ñ", "N"); //N mit Tilde trans.put("Ò", "O"); //O mit Accent grave trans.put("Ó", "O"); //O mit Accent acute trans.put("Ô", "O"); //O mit Circumflex trans.put("Õ", "O"); //O mit Tilde trans.put("Ö", "Oe"); //O Umlaut trans.put("×", " times "); //Mal-Zeichen trans.put("Ø", "O"); //O mit Schrägstrich trans.put("Ù", "U"); //U mit Accent grave trans.put("Ú", "U"); //U mit Accent acute trans.put("Û", "U"); //U mit Circumflex trans.put("Ü", "Ue"); //U Umlaut trans.put("Ý", "Y"); //Y mit Accent acute trans.put("Þ", "P"); //THORN (isländisch) trans.put("ß", "ss"); //scharfes S trans.put("à", "a"); //a mit Accent grave trans.put("á", "a"); //a mit Accent acute trans.put("â", "a"); //a mit Circumflex trans.put("ã", "a"); //a mit Tilde trans.put("ä", "ae"); //a Umlaut trans.put("å", "a"); //a mit Ring trans.put("æ", "a"); //a mit legiertem e trans.put("ç", "c"); //c mit Häkchen trans.put("è", "e"); //e mit Accent grave trans.put("é", "e"); //e mit Accent acute trans.put("ê", "e"); //e mit Circumflex trans.put("ë", "e"); //e Umlaut trans.put("ì", "i"); //i mit Accent grave trans.put("í", "i"); //i mit Accent acute trans.put("î", "i"); //i mit Circumflex trans.put("ï", "i"); //i Umlaut trans.put("ð", "d"); //eth (isländisch) trans.put("ñ", "n"); //n mit Tilde trans.put("ò", "o"); //o mit Accent grave trans.put("ó", "o"); //o mit Accent acute trans.put("ô", "o"); //o mit Circumflex trans.put("õ", "o"); //o mit Tilde trans.put("ö", "oe"); //o Umlaut trans.put("÷", "%"); //Divisions-Zeichen trans.put("ø", "o"); //o mit Schrägstrich trans.put("ù", "u"); //u mit Accent grave trans.put("ú", "u"); //u mit Accent acute trans.put("û", "u"); //u mit Circumflex trans.put("ü", "ue"); //u Umlaut trans.put("ý", "y"); //y mit Accent acute trans.put("þ", "p"); //thorn (isländisch) trans.put("ÿ", "y"); //y Umlaut trans.put("Α", " Alpha "); //Alpha groß trans.put("α", " alpha "); //alpha klein trans.put("Β", " Beta "); //Beta groß trans.put("β", " beta "); //beta klein trans.put("Γ", " Gamma "); //Gamma groß trans.put("γ", " gamma "); //gamma klein trans.put("Δ", " Delta "); //Delta groß trans.put("δ", " delta "); //delta klein trans.put("Ε", " Epsilon "); //Epsilon groß trans.put("ε", " epsilon "); //epsilon klein trans.put("Ζ", " Zeta "); //Zeta groß trans.put("ζ", " zeta "); //zeta klein trans.put("Η", " Eta "); //Eta groß trans.put("η", " eta "); //eta klein trans.put("Θ", " Theta "); //Theta groß trans.put("θ", " theta "); //theta klein trans.put("Ι", " Iota "); //Iota groß trans.put("ι", " iota "); //iota klein trans.put("Κ", " Kappa "); //Kappa groß trans.put("κ", " kappa "); //kappa klein trans.put("Λ", " Lambda "); //Lambda groß trans.put("λ", " lambda "); //lambda klein trans.put("Μ", " Mu "); //Mu groß trans.put("μ", " mu "); //mu klein trans.put("Ν", " Nu "); //Nu groß trans.put("ν", " nu "); //nu klein trans.put("Ξ", " Xi "); //Xi groß trans.put("ξ", " xi "); //xi klein trans.put("Ο", " Omicron "); //Omicron groß trans.put("ο", " omicron "); //omicron klein trans.put("Π", " Pi "); //Pi groß trans.put("π", " pi "); //pi klein trans.put("Ρ", " Rho "); //Rho groß trans.put("ρ", " rho "); //rho klein trans.put("Σ", " Sigma "); //Sigma groß trans.put("ς", " sigma "); //sigmaf klein trans.put("σ", " sigma "); //sigma klein trans.put("Τ", " Tau "); //Tau groß trans.put("τ", " tau "); //tau klein trans.put("Υ", " Ypsilon "); //Upsilon groß trans.put("υ", " ypsilon "); //upsilon klein trans.put("Φ", " Phi "); //Phi groß trans.put("φ", " phi "); //phi klein trans.put("Χ", " Chi "); //Chi groß trans.put("χ", " chi "); //chi klein trans.put("Ψ", " Psi "); //Psi groß trans.put("ψ", " psi "); //psi klein trans.put("Ω", " Omega "); //Omega groß trans.put("ω", " omega "); //omega klein trans.put("ϑ", " theta "); //theta Symbol trans.put("ϒ", " ypsilon "); //upsilon mit Haken trans.put("ϖ", " pi "); //pi Symbol trans.put("∀", " for all "); //für alle trans.put("∂", " part of "); //teilweise trans.put("∃", " exists "); //existiert trans.put("∅", " null "); //leer trans.put("∇", " nabla "); //nabla trans.put("∈", " element of "); //Element von trans.put("∉", " not element of "); //kein Element von trans.put("∋", " contains "); //enthält als Element trans.put("∏", " product "); //Produkt trans.put("∑", " sum "); //Summe trans.put("−", " minus "); //minus trans.put("∗", " times "); //Asterisk trans.put("√", " sqare root "); //Quadratwurzel trans.put("∝", " proportional to "); //proportional zu trans.put("∞", " unlimited "); //unendlich trans.put("∠", " angle "); //Winkel trans.put("∧", " and "); //und trans.put("∨", " or "); //oder trans.put("∩", " "); //Schnittpunkt trans.put("∪", " unity "); //Einheit trans.put("∫", " integral "); //Integral trans.put("∴", " cause "); //deshalb trans.put("∼", " similar to "); //ähnlich wie trans.put("≅", " equal "); //annähernd gleich trans.put("≈", " equal "); //beinahe gleich trans.put("≠", " not equal "); //ungleich trans.put("≡", " identical "); //identisch mit trans.put("≤", " smaller or equal than "); //kleiner gleich trans.put("≥", " greater or equal than "); //größer gleich trans.put("⊂", " subset of "); //Untermenge von trans.put("⊃", " superset of "); //Obermenge von trans.put("⊄", " not subset of "); //keine Untermenge von trans.put("⊆", ""); //Untermenge von oder gleich mit trans.put("⊇", ""); //Obermenge von oder gleich mit trans.put("⊕", ""); //Direktsumme trans.put("⊗", ""); //Vektorprodukt trans.put("⊥", ""); //senkrecht zu trans.put("⋅", ""); //Punkt-Operator trans.put("◊", ""); //Raute trans.put("⌈", ""); //links oben trans.put("⌉", ""); //rechts oben trans.put("⌊", ""); //links unten trans.put("⌋", ""); //rechts unten trans.put("⟨", ""); //spitze Klammer links trans.put("⟩", ""); //spitze Klammer rechts trans.put("←", ""); //Pfeil links trans.put("↑", ""); //Pfeil oben trans.put("→", ""); //Pfeil rechts trans.put("↓", ""); //Pfeil unten trans.put("↔", ""); //Pfeil links/rechts trans.put("↵", ""); //Pfeil unten-Knick-links trans.put("⇐", ""); //Doppelpfeil links trans.put("⇑", ""); //Doppelpfeil oben trans.put("⇒", ""); //Doppelpfeil rechts trans.put("⇓", ""); //Doppelpfeil unten trans.put("⇔", ""); //Doppelpfeil links/rechts trans.put("•", ""); //Bullet-Zeichen trans.put("…", ""); //Horizontale Ellipse trans.put("′", ""); //Minutenzeichen trans.put("‾", ""); //Überstrich trans.put("⁄", ""); //Bruchstrich trans.put("℘", ""); //Weierstrass p trans.put("ℑ", ""); //Zeichen für "imaginär" trans.put("ℜ", ""); //Zeichen für "real" trans.put("™", ""); //Trademark-Zeichen trans.put("€", ""); //Euro-Zeichen trans.put("ℵ", ""); //Alef-Symbol trans.put("♠", ""); //Pik-Zeichen trans.put("♣", ""); //Kreuz-Zeichen trans.put("♥", ""); //Herz-Zeichen trans.put("♦", ""); //Karo-Zeichen trans.put(" ", ""); //Leerzeichen Breite n trans.put(" ", ""); //Leerzeichen Breite m trans.put(" ", ""); //Schmales Leerzeichen trans.put("‌", ""); //null breiter Nichtverbinder trans.put("‍", ""); //null breiter Verbinder trans.put("‎", ""); //links-nach-rechts-Zeichen trans.put("‏", ""); //rechts-nach-links-Zeichen trans.put("–", ""); //Gedankenstrich Breite n trans.put("—", ""); //Gedankenstrich Breite m trans.put("‘", ""); //einfaches Anführungszeichen links trans.put("’", ""); //einfaches Anführungszeichen rechts trans.put("‚", ""); //einfaches low-9-Zeichen trans.put("“", ""); //doppeltes Anführungszeichen links trans.put("”", ""); //doppeltes Anführungszeichen rechts trans.put("„", ""); //doppeltes low-9-Zeichen rechts trans.put("†", ""); //Kreuz trans.put("‡", ""); //Doppelkreuz trans.put("‰", ""); //zu tausend trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts } public htmlFilterAbstractScraper(HashSet tags0, HashSet tags1) { this.tags0 = tags0; this.tags1 = tags1; } public boolean isTag0(String tag) { return (tags0 != null) && (tags0.contains(tag)); } public boolean isTag1(String tag) { return (tags1 != null) && (tags1.contains(tag)); } //the 'missing' method that shall be implemented: public abstract void scrapeText(byte[] text); // the other methods must take into account to construct the return value correctly public void scrapeTag0(String tagname, Properties tagopts) { } public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { } // string conversions private static String code_iso8859s(int c) { switch ((int) c & 0xff) { // german umlaute and ligaturen case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE"; case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue"; case 0xdf: return "ss"; // accent on letters; i.e. french characters case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A"; case 0xc6: return "AE"; case 0xc7: return "C"; case 0xc8: case 0xc9: case 0xca: return "E"; case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I"; case 0xd0: return "D"; case 0xd1: return "N"; case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O"; case 0xd7: return "x"; case 0xd9: case 0xda: case 0xdb: return "U"; case 0xdd: return "Y"; case 0xde: return "p"; case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a"; case 0xe6: return "ae"; case 0xe7: return "c"; case 0xe8: case 0xe9: case 0xea: return "e"; case 0xec: case 0xed: case 0xee: case 0xef: return "i"; case 0xf0: return "d"; case 0xf1: return "n"; case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o"; case 0xf7: return "%"; case 0xf9: case 0xfa: case 0xfb: return "u"; case 0xfd: case 0xff: return "y"; case 0xfe: return "p"; // special characters case 0xa4: return " euro "; default: return null; } } public static serverByteBuffer convertUmlaute(serverByteBuffer bb) { serverByteBuffer t = new serverByteBuffer(bb.length() + 20); int b0, b1, b2; String z; int i = 0; while (i < bb.length()) { b0 = bb.byteAt(i) & 0xff; // check utf-8 encoding if ((b0 < 128) || (i + 1 == bb.length())) { t.append(b0); i++; } else { b1 = bb.byteAt(i + 1) & 0xff; if (b1 > 0x3f) { z = code_iso8859s(b0); i++; } else if ((b0 > 0xbf) && (b0 < 0xe0)) { z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f)); i += 2; } else { if (i + 2 >= bb.length()) { z = null; i++; } else { b2 = bb.byteAt(i + 2) & 0xff; if (b2 > 0x3f) { z = code_iso8859s(b0); i++; } else { z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f)); i += 3; } } } if (z == null) t.append(b0); else t.append(z); } } return t; } private static byte[] transscript(byte[] code) { String t = (String) trans.get(new String(code)); if (t == null) return new byte[0]; else return t.getBytes(); } protected static serverByteBuffer transscriptAll(serverByteBuffer bb) { int p0 = 0, p1; byte[] t; while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) { p1 = bb.indexOf((byte) ';', p0); if (p1 >= 0) { t = transscript(bb.getBytes(p0, p1 + 1)); bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1)); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1)); } } t = null; return bb; } protected static serverByteBuffer stripAllTags(serverByteBuffer bb) { int p0 = 0, p1; while ((p0 = bb.indexOf(lb, p0)) >= 0) { p1 = bb.indexOf(rb, p0); if (p1 >= 0) { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim()); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim()); } } return bb.trim(); } public static serverByteBuffer stripAll(serverByteBuffer bb) { //return stripAllTags(s); return convertUmlaute(transscriptAll(stripAllTags(bb))); } public void close() { // free resources tags0 = null; tags1 = null; } public void finalize() { close(); } } \ No newline at end of file diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index a4751847d..3fb292c10 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -41,6 +41,7 @@ package de.anomic.htmlFilter; import java.net.URL; +import java.net.MalformedURLException; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -178,4 +179,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen System.out.println("TEXT :" + new String(text.getBytes())); } + + public static void main(String[] args) { + String test = "Nokia kürzt bei Forschung und Entwicklung"; + try { + htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost")); + scraper.scrapeText(test.getBytes()); + System.out.println(new String(scraper.getText())); + } catch (MalformedURLException e) {} + } + } diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index ab2c8811c..2ad248f2a 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -149,8 +149,8 @@ public final class plasmaParser { * @see #initMediaExt(String) */ static { - initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," + - "sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj")); + initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," + + "sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj")); /* =================================================== * initializing the parser object pool @@ -383,21 +383,21 @@ public final class plasmaParser { private static void loadEnabledParserList() { // loading a list of availabe parser from file - Properties prop = new Properties(); + Properties prop = new Properties(); BufferedInputStream bufferedIn = null; - try { - prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("yacy.parser")))); - } catch (IOException e) { - System.err.println("ERROR: yacy.parser not found in settings path"); - } finally { + try { + prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("yacy.parser")))); + } catch (IOException e) { + System.err.println("ERROR: yacy.parser not found in settings path"); + } finally { if (bufferedIn != null) try{ bufferedIn.close(); }catch(Exception e){} } - + // enable them ... setEnabledParserList(prop.keySet()); - } - - private static void loadAvailableParserList() { + } + + private static void loadAvailableParserList() { try { plasmaParser.availableParserList.clear(); @@ -405,24 +405,24 @@ public final class plasmaParser { String javaClassPath = System.getProperty("java.class.path"); // getting the current package name - String plasmaParserPkgName = plasmaParser.class.getPackage().getName() + ".parser"; + String plasmaParserPkgName = plasmaParser.class.getPackage().getName() + ".parser"; serverLog.logInfo("PARSER","Searching for additional content parsers in package " + plasmaParserPkgName); - + // getting an uri to the parser subpackage - String packageURI = plasmaParser.class.getResource("/"+plasmaParserPkgName.replace('.','/')).toString(); - serverLog.logDebug("PARSER", "Parser directory is " + packageURI); - + String packageURI = plasmaParser.class.getResource("/"+plasmaParserPkgName.replace('.','/')).toString(); + serverLog.logDebug("PARSER", "Parser directory is " + packageURI); + // open the parser directory - File parserDir = new File(new URI(packageURI)); + File parserDir = new File(new URI(packageURI)); if ((parserDir == null) || (!parserDir.exists()) || (!parserDir.isDirectory())) return; - /* - * loop through all subdirectories and test if we can + /* + * loop through all subdirectories and test if we can * find an additional parser class */ File[] parserDirectories = parserDir.listFiles(parserDirectoryFilter); if (parserDirectories == null) return; - for (int parserDirNr=0; parserDirNr< parserDirectories.length; parserDirNr++) { + for (int parserDirNr=0; parserDirNr< parserDirectories.length; parserDirNr++) { File currentDir = parserDirectories[parserDirNr]; serverLog.logDebug("PARSER", "Searching in directory " + currentDir.toString()); String[] parserClasses = currentDir.list(parserFileNameFilter); @@ -432,7 +432,7 @@ public final class plasmaParser { serverLog.logDebug("PARSER", "Testing parser class " + parserClasses[parserNr]); String className = parserClasses[parserNr].substring(0,parserClasses[parserNr].indexOf(".class")); String fullClassName = plasmaParserPkgName + "." + currentDir.getName() + "." + className; - try { + try { // trying to load the parser class by its name Class parserClass = Class.forName(fullClassName); Object theParser = parserClass.newInstance(); @@ -446,7 +446,7 @@ public final class plasmaParser { throw new ParserException("Missing dependency detected: '" + neededLibx[libxId] + "'."); } } - } + } // loading the list of mime-types that are supported by this parser class Hashtable supportedMimeTypes = ((Parser)theParser).getSupportedMimeTypes(); @@ -456,31 +456,31 @@ public final class plasmaParser { availableParserList.put(mimeType,fullClassName); serverLog.logInfo("PARSER", "Found functional parser for mimeType '" + mimeType + "'."); } - - } catch (Exception e) { /* we can ignore this for the moment */ + + } catch (Exception e) { /* we can ignore this for the moment */ serverLog.logWarning("PARSER", "Parser '" + className + "' doesn't work correctly and will be ignored.\n [" + e.getClass().getName() + "]: " + e.getMessage()); - } catch (Error e) { /* we can ignore this for the moment */ + } catch (Error e) { /* we can ignore this for the moment */ serverLog.logWarning("PARSER", "Parser '" + className + "' doesn't work correctly and will be ignored.\n [" + e.getClass().getName() + "]: " + e.getMessage()); } } - } + } } catch (Exception e) { serverLog.logError("PARSER", "Unable to determine all installed parsers. " + e.getMessage()); - } - } - - public void close() { + } + } + + public void close() { // clearing the parser list synchronized (this.enabledParserList) { - this.enabledParserList.clear(); - } + this.enabledParserList.clear(); + } // closing the parser object pool - try { - this.theParserPool.close(); - } catch (Exception e) { } - } + try { + this.theParserPool.close(); + } catch (Exception e) { } + } public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) { @@ -498,7 +498,6 @@ public final class plasmaParser { // ... otherwise we make a html scraper and transformer htmlFilterContentScraper scraper = new htmlFilterContentScraper(location); OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); - hfos.write(source); hfos.close(); return transformScraper(location, mimeType, scraper); @@ -660,22 +659,24 @@ public final class plasmaParser { return v; } - public static void main(String[] args) { - //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java - //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out - try { - File in = new File(args[0]); - File out = new File(args[1]); - plasmaParser theParser = new plasmaParser(); + public static void main(String[] args) { + //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java + //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out + try { + File in = new File(args[0]); + //File out = new File(args[1]); + plasmaParser theParser = new plasmaParser(); theParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain"); theParser.initParseableMimeTypes("application/atom+xml,application/gzip,application/java-archive,application/msword,application/octet-stream,application/pdf,application/rdf+xml,application/rss+xml,application/rtf,application/x-gzip,application/x-tar,application/xml,application/zip,text/rss,text/rtf,text/xml,application/x-bzip2"); - FileInputStream theInput = new FileInputStream(in); - ByteArrayOutputStream theOutput = new ByteArrayOutputStream(); - serverFileUtils.copy(theInput, theOutput); - plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray()); - //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray()); - byte[] theText = document.getText(); - serverFileUtils.write(theText, out); + FileInputStream theInput = new FileInputStream(in); + ByteArrayOutputStream theOutput = new ByteArrayOutputStream(); + serverFileUtils.copy(theInput, theOutput); + plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray()); + //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray()); + //byte[] theText = document.getText(); + //serverFileUtils.write(theText, out); + String[] sentences = document.getSentences(); + for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]); } catch (Exception e) { e.printStackTrace(); } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index a51785167..d3bdbfdf3 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -105,6 +105,7 @@ public class plasmaSnippetCache { } public result retrieve(URL url, Set queryhashes, boolean fetchOnline) { + // heise = "0OQUNU3JSs05" if (queryhashes.size() == 0) { //System.out.println("found no queryhashes for url retrieve " + url); return new result(null, SOURCE_ERROR, "no query hashes given"); @@ -250,7 +251,7 @@ public class plasmaSnippetCache { } catch (IOException e) {} if (header == null) { - String filename = url.getFile(); + String filename = cacheManager.getCachePath(url).getName(); int p = filename.lastIndexOf('.'); if ((p < 0) || ((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 537fdc58c..1c4ec2b46 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -371,6 +371,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38", new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000); } + + // test routine for snippet fetch + // url = /www.heise.de/mobil/newsticker/meldung/mail/54980 + Set query = new HashSet(); query.add("0OQUNU3JSs05"); // 'heise' + //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true); + plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true); } private static String ppRamString(int bytes) { diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index 158fd147a..cd19d0d72 100644 --- a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -90,12 +90,12 @@ public final class serverFileUtils { FileInputStream fis = null; FileOutputStream fos = null; try { - fis = new FileInputStream(source); - fos = new FileOutputStream(dest); - copy(fis, fos); + fis = new FileInputStream(source); + fos = new FileOutputStream(dest); + copy(fis, fos); } finally { if (fis != null) try {fis.close();} catch (Exception e) {} - if (fos != null) try {fos.close();} catch (Exception e) {} + if (fos != null) try {fos.close();} catch (Exception e) {} } } @@ -107,16 +107,16 @@ public final class serverFileUtils { } public static byte[] read(File source) throws IOException { - byte[] buffer = new byte[(int) source.length()]; - InputStream fis = null; - try { - fis = new FileInputStream(source); - int p = 0, c; - while ((c = fis.read(buffer, p, buffer.length - p)) > 0) p += c; - } finally { + byte[] buffer = new byte[(int) source.length()]; + InputStream fis = null; + try { + fis = new FileInputStream(source); + int p = 0, c; + while ((c = fis.read(buffer, p, buffer.length - p)) > 0) p += c; + } finally { if (fis != null) try { fis.close(); } catch (Exception e) {} - } - return buffer; + } + return buffer; } public static byte[] readAndZip(File source) throws IOException { diff --git a/source/yacy.java b/source/yacy.java index 2af5914c4..69cdaed4b 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -148,6 +148,10 @@ public final class yacy { plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf"); + // hardcoded, forced, temporary value-migration + sb.setConfig("htTemplatePath", "htroot/env/templates"); + sb.setConfig("parseableExt", "html,htm,txt,php,shtml,asp"); + // if we are running an SVN version, we try to detect the used svn revision now ... if (vString.equals("@" + "REPL_VERSION" + "@")) { Properties buildProp = new Properties(); @@ -188,9 +192,6 @@ public final class yacy { if (timeout < 60000) timeout = 60000; int maxSessions = Integer.parseInt(sb.getConfig("httpdMaxSessions", "100")); - // hardcoded, forced, temporary value-migration - sb.setConfig("htTemplatePath", "htroot/env/templates"); - // create some directories File htRootPath = new File(sb.getRootPath(), sb.getConfig("htRootPath", "htroot")); File htDocsPath = new File(sb.getRootPath(), sb.getConfig("htDocsPath", "DATA/HTDOCS")); diff --git a/yacy.init b/yacy.init index 74b8dbb76..c0b620b21 100644 --- a/yacy.init +++ b/yacy.init @@ -100,7 +100,7 @@ parseableMimeTypes= # this is important to recognize - tags as not-html reference # These files will be excluded from indexing _(Please keep extensions in alphabetical order)_ mediaExt=ace,arj,asf,avi,bin,bz2,css,deb,doc,dmg,gif,gz,hqx,img,iso,jar,jpe,jpg,jpeg,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,sit,swf,sxc,sxd,sxi,sxw,tar,tgz,torrent,wmv,xcf,xls,zip -parseableExt=html,htm,txt +parseableExt=html,htm,txt,php,shtml,asp # Promotion Strings # These strings appear in the Web Mask of the YACY search client