diff --git a/build.properties b/build.properties index f0152f69e..e7dcc5333 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.384 +releaseVersion=0.385 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr} diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java index 7e8c4a8a6..078b1b260 100644 --- a/htroot/IndexCreateIndexingQueue_p.java +++ b/htroot/IndexCreateIndexingQueue_p.java @@ -46,11 +46,13 @@ import java.text.SimpleDateFormat; import java.util.Date; import java.util.Locale; +import java.io.IOException; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaSwitchboardQueue; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; @@ -83,26 +85,27 @@ public class IndexCreateIndexingQueue_p { boolean dark; int i; - if (switchboard.queueStack.size() == 0) { + if (switchboard.sbQueue.size() == 0) { prop.put("indexing-queue", 0); //is empty } else { prop.put("indexing-queue", 1); - prop.put("indexing-queue_num", switchboard.queueStack.size());//num entries in queue + prop.put("indexing-queue_num", switchboard.sbQueue.size());//num entries in queue dark = true; - plasmaHTCache.Entry pcentry; - for (i = 0; i < switchboard.queueStack.size(); i++) { - pcentry = (plasmaHTCache.Entry) switchboard.queueStack.get(i); + plasmaSwitchboardQueue.Entry pcentry; + for (i = 0; i < switchboard.sbQueue.size(); i++) try { + pcentry = (plasmaSwitchboardQueue.Entry) switchboard.sbQueue.get(i); if (pcentry != null) { initiator = yacyCore.seedDB.getConnected(pcentry.initiator()); prop.put("indexing-queue_list_"+i+"_dark", ((dark) ? 1 : 0)); prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName())); - prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth); - prop.put("indexing-queue_list_"+i+"_modified", daydate(pcentry.lastModified)); - prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getAnchors().size()))); - prop.put("indexing-queue_list_"+i+"_anchor", ((pcentry.scraper == null) ? "-" : pcentry.scraper.getHeadline()) ); - prop.put("indexing-queue_list_"+i+"_url", pcentry.nomalizedURLString); + prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth()); + prop.put("indexing-queue_list_"+i+"_modified", (pcentry.responseHeader() == null) ? "null" : daydate(pcentry.responseHeader().lastModified())); + prop.put("indexing-queue_list_"+i+"_href", pcentry.forkFactor()); + prop.put("indexing-queue_list_"+i+"_anchor", pcentry.anchorName()); + prop.put("indexing-queue_list_"+i+"_url", pcentry.normalizedURLString()); dark = !dark; } + } catch (IOException e) { } prop.put("indexing-queue_list", i); } diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index 13f5f6135..123ca6970 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -181,7 +181,7 @@ public class IndexCreate_p { prop.put("xdstopwChecked", env.getConfig("xdstopw", "").equals("true") ? 1 : 0); prop.put("xpstopwChecked", env.getConfig("xpstopw", "").equals("true") ? 1 : 0); - int queueStackSize = switchboard.queueStack.size(); + int queueStackSize = switchboard.sbQueue.size(); int loaderThreadsSize = switchboard.cacheLoader.size(); int crawlerListSize = switchboard.urlPool.noticeURL.stackSize(); int completequeue = queueStackSize + loaderThreadsSize + crawlerListSize; diff --git a/source/de/anomic/data/listManager.java b/source/de/anomic/data/listManager.java index 54aabce63..0a9daf4ab 100644 --- a/source/de/anomic/data/listManager.java +++ b/source/de/anomic/data/listManager.java @@ -51,9 +51,9 @@ import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; +import java.util.Iterator; import java.util.TreeMap; import java.util.Vector; -import java.util.Iterator; import de.anomic.http.httpdProxyHandler; import de.anomic.plasma.plasmaSwitchboard; diff --git a/source/de/anomic/data/translator.java b/source/de/anomic/data/translator.java index 8b31a272c..0b47ff343 100644 --- a/source/de/anomic/data/translator.java +++ b/source/de/anomic/data/translator.java @@ -44,21 +44,19 @@ package de.anomic.data; -import java.io.File; -import java.io.FileFilter; -import java.io.FileInputStream; import java.io.BufferedReader; -import java.io.InputStreamReader; import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; -import java.util.Hashtable; +import java.io.InputStreamReader; import java.util.Enumeration; +import java.util.Hashtable; import java.util.Iterator; import java.util.Vector; import de.anomic.server.logging.serverLog; -import de.anomic.data.listManager; /** * Wordlist based translator diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index bbf87c0e7..6004bbd30 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -1 +1 @@ -// htmlFilterAbstractScraper.java // --------------------------- // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // last major change: 18.02.2004 // // You agree that the Author(s) is (are) not responsible for cost, // loss of data or any harm that may be caused by usage of this softare or // this documentation. The usage of this software is on your own risk. The // installation and usage (starting/running) of this software may allow other // people or application to access your computer and any attached devices and // is highly dependent on the configuration of the software which must be // done by the user of the software;the author(s) is (are) also // not responsible for proper configuration and usage of the software, even // if provoked by documentation provided together with the software. // // THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION // IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS // FILE AND AS IN http://www.gnu.org/licenses/gpl.txt // ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE // LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT // BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION // MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. // CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. package de.anomic.htmlFilter; import java.util.TreeSet; import java.util.HashMap; import java.util.Properties; import de.anomic.server.serverByteBuffer; public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public static final byte lb = (byte) '<'; public static final byte rb = (byte) '>'; public static final byte sl = (byte) '/'; private TreeSet tags0; private TreeSet tags1; // define a translation table for html character codings private static HashMap trans = new HashMap(300); static { trans.put(""", "\""); //Anführungszeichen oben trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und trans.put("<", "<"); //öffnende spitze Klammer trans.put(">", ">"); //schließende spitze Klammer trans.put(" ", " "); //Erzwungenes Leerzeichen trans.put("¡", "!"); //umgekehrtes Ausrufezeichen trans.put("¢", " cent "); //Cent-Zeichen trans.put("£", " pound "); //Pfund-Zeichen trans.put("¤", " currency "); //Währungs-Zeichen trans.put("¥", " yen "); //Yen-Zeichen trans.put("¦", " "); //durchbrochener Strich trans.put("§", " paragraph "); //Paragraph-Zeichen trans.put("¨", " "); //Pünktchen oben trans.put("©", " copyright "); //Copyright-Zeichen trans.put("ª", " "); //Ordinal-Zeichen weiblich trans.put("«", " "); //angewinkelte Anführungszeichen links trans.put("¬", " not "); //Verneinungs-Zeichen trans.put("­", "-"); //kurzer Trennstrich trans.put("®", " trademark "); //Registriermarke-Zeichen trans.put("¯", " "); //Überstrich trans.put("°", " degree "); //Grad-Zeichen trans.put("±", " +/- "); //Plusminus-Zeichen trans.put("²", " square "); //Hoch-2-Zeichen trans.put("³", " 3 "); //Hoch-3-Zeichen trans.put("´", " "); //Acute-Zeichen trans.put("µ", " micro "); //Mikro-Zeichen trans.put("¶", " paragraph "); //Absatz-Zeichen trans.put("·", " "); //Mittelpunkt trans.put("¸", " "); //Häkchen unten trans.put("¹", " "); //Hoch-1-Zeichen trans.put("º", " degree "); //Ordinal-Zeichen männlich trans.put("»", " "); //angewinkelte Anführungszeichen rechts trans.put("¼", " quarter "); //ein Viertel trans.put("½", " half "); //ein Halb trans.put("¾", " 3/4 "); //drei Viertel trans.put("¿", "?"); //umgekehrtes Fragezeichen trans.put("À", "A"); //A mit Accent grave trans.put("Á", "A"); //A mit Accent acute trans.put("Â", "A"); //A mit Circumflex trans.put("Ã", "A"); //A mit Tilde trans.put("Ä", "Ae"); //A Umlaut trans.put("Å", "A"); //A mit Ring trans.put("Æ", "A"); //A mit legiertem E trans.put("Ç", "C"); //C mit Häkchen trans.put("È", "E"); //E mit Accent grave trans.put("É", "E"); //E mit Accent acute trans.put("Ê", "E"); //E mit Circumflex trans.put("Ë", "E"); //E Umlaut trans.put("Ì", "I"); //I mit Accent grave trans.put("Í", "I"); //I mit Accent acute trans.put("Î", "I"); //I mit Circumflex trans.put("Ï", "I"); //I Umlaut trans.put("Ð", "D"); //Eth (isländisch) trans.put("Ñ", "N"); //N mit Tilde trans.put("Ò", "O"); //O mit Accent grave trans.put("Ó", "O"); //O mit Accent acute trans.put("Ô", "O"); //O mit Circumflex trans.put("Õ", "O"); //O mit Tilde trans.put("Ö", "Oe"); //O Umlaut trans.put("×", " times "); //Mal-Zeichen trans.put("Ø", "O"); //O mit Schrägstrich trans.put("Ù", "U"); //U mit Accent grave trans.put("Ú", "U"); //U mit Accent acute trans.put("Û", "U"); //U mit Circumflex trans.put("Ü", "Ue"); //U Umlaut trans.put("Ý", "Y"); //Y mit Accent acute trans.put("Þ", "P"); //THORN (isländisch) trans.put("ß", "ss"); //scharfes S trans.put("à", "a"); //a mit Accent grave trans.put("á", "a"); //a mit Accent acute trans.put("â", "a"); //a mit Circumflex trans.put("ã", "a"); //a mit Tilde trans.put("ä", "ae"); //a Umlaut trans.put("å", "a"); //a mit Ring trans.put("æ", "a"); //a mit legiertem e trans.put("ç", "c"); //c mit Häkchen trans.put("è", "e"); //e mit Accent grave trans.put("é", "e"); //e mit Accent acute trans.put("ê", "e"); //e mit Circumflex trans.put("ë", "e"); //e Umlaut trans.put("ì", "i"); //i mit Accent grave trans.put("í", "i"); //i mit Accent acute trans.put("î", "i"); //i mit Circumflex trans.put("ï", "i"); //i Umlaut trans.put("ð", "d"); //eth (isländisch) trans.put("ñ", "n"); //n mit Tilde trans.put("ò", "o"); //o mit Accent grave trans.put("ó", "o"); //o mit Accent acute trans.put("ô", "o"); //o mit Circumflex trans.put("õ", "o"); //o mit Tilde trans.put("ö", "oe"); //o Umlaut trans.put("÷", "%"); //Divisions-Zeichen trans.put("ø", "o"); //o mit Schrägstrich trans.put("ù", "u"); //u mit Accent grave trans.put("ú", "u"); //u mit Accent acute trans.put("û", "u"); //u mit Circumflex trans.put("ü", "ue"); //u Umlaut trans.put("ý", "y"); //y mit Accent acute trans.put("þ", "p"); //thorn (isländisch) trans.put("ÿ", "y"); //y Umlaut trans.put("Α", " Alpha "); //Alpha groß trans.put("α", " alpha "); //alpha klein trans.put("Β", " Beta "); //Beta groß trans.put("β", " beta "); //beta klein trans.put("Γ", " Gamma "); //Gamma groß trans.put("γ", " gamma "); //gamma klein trans.put("Δ", " Delta "); //Delta groß trans.put("δ", " delta "); //delta klein trans.put("Ε", " Epsilon "); //Epsilon groß trans.put("ε", " epsilon "); //epsilon klein trans.put("Ζ", " Zeta "); //Zeta groß trans.put("ζ", " zeta "); //zeta klein trans.put("Η", " Eta "); //Eta groß trans.put("η", " eta "); //eta klein trans.put("Θ", " Theta "); //Theta groß trans.put("θ", " theta "); //theta klein trans.put("Ι", " Iota "); //Iota groß trans.put("ι", " iota "); //iota klein trans.put("Κ", " Kappa "); //Kappa groß trans.put("κ", " kappa "); //kappa klein trans.put("Λ", " Lambda "); //Lambda groß trans.put("λ", " lambda "); //lambda klein trans.put("Μ", " Mu "); //Mu groß trans.put("μ", " mu "); //mu klein trans.put("Ν", " Nu "); //Nu groß trans.put("ν", " nu "); //nu klein trans.put("Ξ", " Xi "); //Xi groß trans.put("ξ", " xi "); //xi klein trans.put("Ο", " Omicron "); //Omicron groß trans.put("ο", " omicron "); //omicron klein trans.put("Π", " Pi "); //Pi groß trans.put("π", " pi "); //pi klein trans.put("Ρ", " Rho "); //Rho groß trans.put("ρ", " rho "); //rho klein trans.put("Σ", " Sigma "); //Sigma groß trans.put("ς", " sigma "); //sigmaf klein trans.put("σ", " sigma "); //sigma klein trans.put("Τ", " Tau "); //Tau groß trans.put("τ", " tau "); //tau klein trans.put("Υ", " Ypsilon "); //Upsilon groß trans.put("υ", " ypsilon "); //upsilon klein trans.put("Φ", " Phi "); //Phi groß trans.put("φ", " phi "); //phi klein trans.put("Χ", " Chi "); //Chi groß trans.put("χ", " chi "); //chi klein trans.put("Ψ", " Psi "); //Psi groß trans.put("ψ", " psi "); //psi klein trans.put("Ω", " Omega "); //Omega groß trans.put("ω", " omega "); //omega klein trans.put("ϑ", " theta "); //theta Symbol trans.put("ϒ", " ypsilon "); //upsilon mit Haken trans.put("ϖ", " pi "); //pi Symbol trans.put("∀", " for all "); //für alle trans.put("∂", " part of "); //teilweise trans.put("∃", " exists "); //existiert trans.put("∅", " null "); //leer trans.put("∇", " nabla "); //nabla trans.put("∈", " element of "); //Element von trans.put("∉", " not element of "); //kein Element von trans.put("∋", " contains "); //enthält als Element trans.put("∏", " product "); //Produkt trans.put("∑", " sum "); //Summe trans.put("−", " minus "); //minus trans.put("∗", " times "); //Asterisk trans.put("√", " sqare root "); //Quadratwurzel trans.put("∝", " proportional to "); //proportional zu trans.put("∞", " unlimited "); //unendlich trans.put("∠", " angle "); //Winkel trans.put("∧", " and "); //und trans.put("∨", " or "); //oder trans.put("∩", " "); //Schnittpunkt trans.put("∪", " unity "); //Einheit trans.put("∫", " integral "); //Integral trans.put("∴", " cause "); //deshalb trans.put("∼", " similar to "); //ähnlich wie trans.put("≅", " equal "); //annähernd gleich trans.put("≈", " equal "); //beinahe gleich trans.put("≠", " not equal "); //ungleich trans.put("≡", " identical "); //identisch mit trans.put("≤", " smaller or equal than "); //kleiner gleich trans.put("≥", " greater or equal than "); //größer gleich trans.put("⊂", " subset of "); //Untermenge von trans.put("⊃", " superset of "); //Obermenge von trans.put("⊄", " not subset of "); //keine Untermenge von trans.put("⊆", ""); //Untermenge von oder gleich mit trans.put("⊇", ""); //Obermenge von oder gleich mit trans.put("⊕", ""); //Direktsumme trans.put("⊗", ""); //Vektorprodukt trans.put("⊥", ""); //senkrecht zu trans.put("⋅", ""); //Punkt-Operator trans.put("◊", ""); //Raute trans.put("⌈", ""); //links oben trans.put("⌉", ""); //rechts oben trans.put("⌊", ""); //links unten trans.put("⌋", ""); //rechts unten trans.put("⟨", ""); //spitze Klammer links trans.put("⟩", ""); //spitze Klammer rechts trans.put("←", ""); //Pfeil links trans.put("↑", ""); //Pfeil oben trans.put("→", ""); //Pfeil rechts trans.put("↓", ""); //Pfeil unten trans.put("↔", ""); //Pfeil links/rechts trans.put("↵", ""); //Pfeil unten-Knick-links trans.put("⇐", ""); //Doppelpfeil links trans.put("⇑", ""); //Doppelpfeil oben trans.put("⇒", ""); //Doppelpfeil rechts trans.put("⇓", ""); //Doppelpfeil unten trans.put("⇔", ""); //Doppelpfeil links/rechts trans.put("•", ""); //Bullet-Zeichen trans.put("…", ""); //Horizontale Ellipse trans.put("′", ""); //Minutenzeichen trans.put("‾", ""); //Überstrich trans.put("⁄", ""); //Bruchstrich trans.put("℘", ""); //Weierstrass p trans.put("ℑ", ""); //Zeichen für "imaginär" trans.put("ℜ", ""); //Zeichen für "real" trans.put("™", ""); //Trademark-Zeichen trans.put("€", ""); //Euro-Zeichen trans.put("ℵ", ""); //Alef-Symbol trans.put("♠", ""); //Pik-Zeichen trans.put("♣", ""); //Kreuz-Zeichen trans.put("♥", ""); //Herz-Zeichen trans.put("♦", ""); //Karo-Zeichen trans.put(" ", ""); //Leerzeichen Breite n trans.put(" ", ""); //Leerzeichen Breite m trans.put(" ", ""); //Schmales Leerzeichen trans.put("‌", ""); //null breiter Nichtverbinder trans.put("‍", ""); //null breiter Verbinder trans.put("‎", ""); //links-nach-rechts-Zeichen trans.put("‏", ""); //rechts-nach-links-Zeichen trans.put("–", ""); //Gedankenstrich Breite n trans.put("—", ""); //Gedankenstrich Breite m trans.put("‘", ""); //einfaches Anführungszeichen links trans.put("’", ""); //einfaches Anführungszeichen rechts trans.put("‚", ""); //einfaches low-9-Zeichen trans.put("“", ""); //doppeltes Anführungszeichen links trans.put("”", ""); //doppeltes Anführungszeichen rechts trans.put("„", ""); //doppeltes low-9-Zeichen rechts trans.put("†", ""); //Kreuz trans.put("‡", ""); //Doppelkreuz trans.put("‰", ""); //zu tausend trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts } public htmlFilterAbstractScraper(TreeSet tags0, TreeSet tags1) { this.tags0 = tags0; this.tags1 = tags1; } public boolean isTag0(String tag) { return (tags0 != null) && (tags0.contains(tag)); } public boolean isTag1(String tag) { return (tags1 != null) && (tags1.contains(tag)); } //the 'missing' method that shall be implemented: public abstract void scrapeText(byte[] text); // the other methods must take into account to construct the return value correctly public void scrapeTag0(String tagname, Properties tagopts) { } public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { } // string conversions private static String code_iso8859s(int c) { switch ((int) c & 0xff) { // german umlaute and ligaturen case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE"; case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue"; case 0xdf: return "ss"; // accent on letters; i.e. french characters case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A"; case 0xc6: return "AE"; case 0xc7: return "C"; case 0xc8: case 0xc9: case 0xca: return "E"; case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I"; case 0xd0: return "D"; case 0xd1: return "N"; case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O"; case 0xd7: return "x"; case 0xd9: case 0xda: case 0xdb: return "U"; case 0xdd: return "Y"; case 0xde: return "p"; case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a"; case 0xe6: return "ae"; case 0xe7: return "c"; case 0xe8: case 0xe9: case 0xea: return "e"; case 0xec: case 0xed: case 0xee: case 0xef: return "i"; case 0xf0: return "d"; case 0xf1: return "n"; case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o"; case 0xf7: return "%"; case 0xf9: case 0xfa: case 0xfb: return "u"; case 0xfd: case 0xff: return "y"; case 0xfe: return "p"; // special characters case 0xa4: return " euro "; default: return null; } } public static serverByteBuffer convertUmlaute(serverByteBuffer bb) { serverByteBuffer t = new serverByteBuffer(bb.length() + 20); int b0, b1, b2; String z; int i = 0; while (i < bb.length()) { b0 = bb.byteAt(i) & 0xff; // check utf-8 encoding if ((b0 < 128) || (i + 1 == bb.length())) { t.append(b0); i++; } else { b1 = bb.byteAt(i + 1) & 0xff; if (b1 > 0x3f) { z = code_iso8859s(b0); i++; } else if ((b0 > 0xbf) && (b0 < 0xe0)) { z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f)); i += 2; } else { if (i + 2 >= bb.length()) { z = null; i++; } else { b2 = bb.byteAt(i + 2) & 0xff; if (b2 > 0x3f) { z = code_iso8859s(b0); i++; } else { z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f)); i += 3; } } } if (z == null) t.append(b0); else t.append(z); } } return t; } private static byte[] transscript(byte[] code) { String t = (String) trans.get(new String(code)); if (t == null) return new byte[0]; else return t.getBytes(); } protected static serverByteBuffer transscriptAll(serverByteBuffer bb) { int p0 = 0, p1; byte[] t; while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) { p1 = bb.indexOf((byte) ';', p0); if (p1 >= 0) { t = transscript(bb.getBytes(p0, p1 + 1)); bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1)); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1)); } } t = null; return bb; } protected static serverByteBuffer stripAllTags(serverByteBuffer bb) { int p0 = 0, p1; while ((p0 = bb.indexOf(lb, p0)) >= 0) { p1 = bb.indexOf(rb, p0); if (p1 >= 0) { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim()); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim()); } } return bb.trim(); } public static serverByteBuffer stripAll(serverByteBuffer bb) { //return stripAllTags(s); return convertUmlaute(transscriptAll(stripAllTags(bb))); } public void close() { // free resources tags0 = null; tags1 = null; } public void finalize() { close(); } } \ No newline at end of file +// htmlFilterAbstractScraper.java // --------------------------- // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // last major change: 18.02.2004 // // You agree that the Author(s) is (are) not responsible for cost, // loss of data or any harm that may be caused by usage of this softare or // this documentation. The usage of this software is on your own risk. The // installation and usage (starting/running) of this software may allow other // people or application to access your computer and any attached devices and // is highly dependent on the configuration of the software which must be // done by the user of the software;the author(s) is (are) also // not responsible for proper configuration and usage of the software, even // if provoked by documentation provided together with the software. // // THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION // IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS // FILE AND AS IN http://www.gnu.org/licenses/gpl.txt // ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE // LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT // BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION // MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. // CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. package de.anomic.htmlFilter; import java.util.HashMap; import java.util.Properties; import java.util.TreeSet; import de.anomic.server.serverByteBuffer; public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public static final byte lb = (byte) '<'; public static final byte rb = (byte) '>'; public static final byte sl = (byte) '/'; private TreeSet tags0; private TreeSet tags1; // define a translation table for html character codings private static HashMap trans = new HashMap(300); static { trans.put(""", "\""); //Anführungszeichen oben trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und trans.put("<", "<"); //öffnende spitze Klammer trans.put(">", ">"); //schließende spitze Klammer trans.put(" ", " "); //Erzwungenes Leerzeichen trans.put("¡", "!"); //umgekehrtes Ausrufezeichen trans.put("¢", " cent "); //Cent-Zeichen trans.put("£", " pound "); //Pfund-Zeichen trans.put("¤", " currency "); //Währungs-Zeichen trans.put("¥", " yen "); //Yen-Zeichen trans.put("¦", " "); //durchbrochener Strich trans.put("§", " paragraph "); //Paragraph-Zeichen trans.put("¨", " "); //Pünktchen oben trans.put("©", " copyright "); //Copyright-Zeichen trans.put("ª", " "); //Ordinal-Zeichen weiblich trans.put("«", " "); //angewinkelte Anführungszeichen links trans.put("¬", " not "); //Verneinungs-Zeichen trans.put("­", "-"); //kurzer Trennstrich trans.put("®", " trademark "); //Registriermarke-Zeichen trans.put("¯", " "); //Überstrich trans.put("°", " degree "); //Grad-Zeichen trans.put("±", " +/- "); //Plusminus-Zeichen trans.put("²", " square "); //Hoch-2-Zeichen trans.put("³", " 3 "); //Hoch-3-Zeichen trans.put("´", " "); //Acute-Zeichen trans.put("µ", " micro "); //Mikro-Zeichen trans.put("¶", " paragraph "); //Absatz-Zeichen trans.put("·", " "); //Mittelpunkt trans.put("¸", " "); //Häkchen unten trans.put("¹", " "); //Hoch-1-Zeichen trans.put("º", " degree "); //Ordinal-Zeichen männlich trans.put("»", " "); //angewinkelte Anführungszeichen rechts trans.put("¼", " quarter "); //ein Viertel trans.put("½", " half "); //ein Halb trans.put("¾", " 3/4 "); //drei Viertel trans.put("¿", "?"); //umgekehrtes Fragezeichen trans.put("À", "A"); //A mit Accent grave trans.put("Á", "A"); //A mit Accent acute trans.put("Â", "A"); //A mit Circumflex trans.put("Ã", "A"); //A mit Tilde trans.put("Ä", "Ae"); //A Umlaut trans.put("Å", "A"); //A mit Ring trans.put("Æ", "A"); //A mit legiertem E trans.put("Ç", "C"); //C mit Häkchen trans.put("È", "E"); //E mit Accent grave trans.put("É", "E"); //E mit Accent acute trans.put("Ê", "E"); //E mit Circumflex trans.put("Ë", "E"); //E Umlaut trans.put("Ì", "I"); //I mit Accent grave trans.put("Í", "I"); //I mit Accent acute trans.put("Î", "I"); //I mit Circumflex trans.put("Ï", "I"); //I Umlaut trans.put("Ð", "D"); //Eth (isländisch) trans.put("Ñ", "N"); //N mit Tilde trans.put("Ò", "O"); //O mit Accent grave trans.put("Ó", "O"); //O mit Accent acute trans.put("Ô", "O"); //O mit Circumflex trans.put("Õ", "O"); //O mit Tilde trans.put("Ö", "Oe"); //O Umlaut trans.put("×", " times "); //Mal-Zeichen trans.put("Ø", "O"); //O mit Schrägstrich trans.put("Ù", "U"); //U mit Accent grave trans.put("Ú", "U"); //U mit Accent acute trans.put("Û", "U"); //U mit Circumflex trans.put("Ü", "Ue"); //U Umlaut trans.put("Ý", "Y"); //Y mit Accent acute trans.put("Þ", "P"); //THORN (isländisch) trans.put("ß", "ss"); //scharfes S trans.put("à", "a"); //a mit Accent grave trans.put("á", "a"); //a mit Accent acute trans.put("â", "a"); //a mit Circumflex trans.put("ã", "a"); //a mit Tilde trans.put("ä", "ae"); //a Umlaut trans.put("å", "a"); //a mit Ring trans.put("æ", "a"); //a mit legiertem e trans.put("ç", "c"); //c mit Häkchen trans.put("è", "e"); //e mit Accent grave trans.put("é", "e"); //e mit Accent acute trans.put("ê", "e"); //e mit Circumflex trans.put("ë", "e"); //e Umlaut trans.put("ì", "i"); //i mit Accent grave trans.put("í", "i"); //i mit Accent acute trans.put("î", "i"); //i mit Circumflex trans.put("ï", "i"); //i Umlaut trans.put("ð", "d"); //eth (isländisch) trans.put("ñ", "n"); //n mit Tilde trans.put("ò", "o"); //o mit Accent grave trans.put("ó", "o"); //o mit Accent acute trans.put("ô", "o"); //o mit Circumflex trans.put("õ", "o"); //o mit Tilde trans.put("ö", "oe"); //o Umlaut trans.put("÷", "%"); //Divisions-Zeichen trans.put("ø", "o"); //o mit Schrägstrich trans.put("ù", "u"); //u mit Accent grave trans.put("ú", "u"); //u mit Accent acute trans.put("û", "u"); //u mit Circumflex trans.put("ü", "ue"); //u Umlaut trans.put("ý", "y"); //y mit Accent acute trans.put("þ", "p"); //thorn (isländisch) trans.put("ÿ", "y"); //y Umlaut trans.put("Α", " Alpha "); //Alpha groß trans.put("α", " alpha "); //alpha klein trans.put("Β", " Beta "); //Beta groß trans.put("β", " beta "); //beta klein trans.put("Γ", " Gamma "); //Gamma groß trans.put("γ", " gamma "); //gamma klein trans.put("Δ", " Delta "); //Delta groß trans.put("δ", " delta "); //delta klein trans.put("Ε", " Epsilon "); //Epsilon groß trans.put("ε", " epsilon "); //epsilon klein trans.put("Ζ", " Zeta "); //Zeta groß trans.put("ζ", " zeta "); //zeta klein trans.put("Η", " Eta "); //Eta groß trans.put("η", " eta "); //eta klein trans.put("Θ", " Theta "); //Theta groß trans.put("θ", " theta "); //theta klein trans.put("Ι", " Iota "); //Iota groß trans.put("ι", " iota "); //iota klein trans.put("Κ", " Kappa "); //Kappa groß trans.put("κ", " kappa "); //kappa klein trans.put("Λ", " Lambda "); //Lambda groß trans.put("λ", " lambda "); //lambda klein trans.put("Μ", " Mu "); //Mu groß trans.put("μ", " mu "); //mu klein trans.put("Ν", " Nu "); //Nu groß trans.put("ν", " nu "); //nu klein trans.put("Ξ", " Xi "); //Xi groß trans.put("ξ", " xi "); //xi klein trans.put("Ο", " Omicron "); //Omicron groß trans.put("ο", " omicron "); //omicron klein trans.put("Π", " Pi "); //Pi groß trans.put("π", " pi "); //pi klein trans.put("Ρ", " Rho "); //Rho groß trans.put("ρ", " rho "); //rho klein trans.put("Σ", " Sigma "); //Sigma groß trans.put("ς", " sigma "); //sigmaf klein trans.put("σ", " sigma "); //sigma klein trans.put("Τ", " Tau "); //Tau groß trans.put("τ", " tau "); //tau klein trans.put("Υ", " Ypsilon "); //Upsilon groß trans.put("υ", " ypsilon "); //upsilon klein trans.put("Φ", " Phi "); //Phi groß trans.put("φ", " phi "); //phi klein trans.put("Χ", " Chi "); //Chi groß trans.put("χ", " chi "); //chi klein trans.put("Ψ", " Psi "); //Psi groß trans.put("ψ", " psi "); //psi klein trans.put("Ω", " Omega "); //Omega groß trans.put("ω", " omega "); //omega klein trans.put("ϑ", " theta "); //theta Symbol trans.put("ϒ", " ypsilon "); //upsilon mit Haken trans.put("ϖ", " pi "); //pi Symbol trans.put("∀", " for all "); //für alle trans.put("∂", " part of "); //teilweise trans.put("∃", " exists "); //existiert trans.put("∅", " null "); //leer trans.put("∇", " nabla "); //nabla trans.put("∈", " element of "); //Element von trans.put("∉", " not element of "); //kein Element von trans.put("∋", " contains "); //enthält als Element trans.put("∏", " product "); //Produkt trans.put("∑", " sum "); //Summe trans.put("−", " minus "); //minus trans.put("∗", " times "); //Asterisk trans.put("√", " sqare root "); //Quadratwurzel trans.put("∝", " proportional to "); //proportional zu trans.put("∞", " unlimited "); //unendlich trans.put("∠", " angle "); //Winkel trans.put("∧", " and "); //und trans.put("∨", " or "); //oder trans.put("∩", " "); //Schnittpunkt trans.put("∪", " unity "); //Einheit trans.put("∫", " integral "); //Integral trans.put("∴", " cause "); //deshalb trans.put("∼", " similar to "); //ähnlich wie trans.put("≅", " equal "); //annähernd gleich trans.put("≈", " equal "); //beinahe gleich trans.put("≠", " not equal "); //ungleich trans.put("≡", " identical "); //identisch mit trans.put("≤", " smaller or equal than "); //kleiner gleich trans.put("≥", " greater or equal than "); //größer gleich trans.put("⊂", " subset of "); //Untermenge von trans.put("⊃", " superset of "); //Obermenge von trans.put("⊄", " not subset of "); //keine Untermenge von trans.put("⊆", ""); //Untermenge von oder gleich mit trans.put("⊇", ""); //Obermenge von oder gleich mit trans.put("⊕", ""); //Direktsumme trans.put("⊗", ""); //Vektorprodukt trans.put("⊥", ""); //senkrecht zu trans.put("⋅", ""); //Punkt-Operator trans.put("◊", ""); //Raute trans.put("⌈", ""); //links oben trans.put("⌉", ""); //rechts oben trans.put("⌊", ""); //links unten trans.put("⌋", ""); //rechts unten trans.put("⟨", ""); //spitze Klammer links trans.put("⟩", ""); //spitze Klammer rechts trans.put("←", ""); //Pfeil links trans.put("↑", ""); //Pfeil oben trans.put("→", ""); //Pfeil rechts trans.put("↓", ""); //Pfeil unten trans.put("↔", ""); //Pfeil links/rechts trans.put("↵", ""); //Pfeil unten-Knick-links trans.put("⇐", ""); //Doppelpfeil links trans.put("⇑", ""); //Doppelpfeil oben trans.put("⇒", ""); //Doppelpfeil rechts trans.put("⇓", ""); //Doppelpfeil unten trans.put("⇔", ""); //Doppelpfeil links/rechts trans.put("•", ""); //Bullet-Zeichen trans.put("…", ""); //Horizontale Ellipse trans.put("′", ""); //Minutenzeichen trans.put("‾", ""); //Überstrich trans.put("⁄", ""); //Bruchstrich trans.put("℘", ""); //Weierstrass p trans.put("ℑ", ""); //Zeichen für "imaginär" trans.put("ℜ", ""); //Zeichen für "real" trans.put("™", ""); //Trademark-Zeichen trans.put("€", ""); //Euro-Zeichen trans.put("ℵ", ""); //Alef-Symbol trans.put("♠", ""); //Pik-Zeichen trans.put("♣", ""); //Kreuz-Zeichen trans.put("♥", ""); //Herz-Zeichen trans.put("♦", ""); //Karo-Zeichen trans.put(" ", ""); //Leerzeichen Breite n trans.put(" ", ""); //Leerzeichen Breite m trans.put(" ", ""); //Schmales Leerzeichen trans.put("‌", ""); //null breiter Nichtverbinder trans.put("‍", ""); //null breiter Verbinder trans.put("‎", ""); //links-nach-rechts-Zeichen trans.put("‏", ""); //rechts-nach-links-Zeichen trans.put("–", ""); //Gedankenstrich Breite n trans.put("—", ""); //Gedankenstrich Breite m trans.put("‘", ""); //einfaches Anführungszeichen links trans.put("’", ""); //einfaches Anführungszeichen rechts trans.put("‚", ""); //einfaches low-9-Zeichen trans.put("“", ""); //doppeltes Anführungszeichen links trans.put("”", ""); //doppeltes Anführungszeichen rechts trans.put("„", ""); //doppeltes low-9-Zeichen rechts trans.put("†", ""); //Kreuz trans.put("‡", ""); //Doppelkreuz trans.put("‰", ""); //zu tausend trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts } public htmlFilterAbstractScraper(TreeSet tags0, TreeSet tags1) { this.tags0 = tags0; this.tags1 = tags1; } public boolean isTag0(String tag) { return (tags0 != null) && (tags0.contains(tag)); } public boolean isTag1(String tag) { return (tags1 != null) && (tags1.contains(tag)); } //the 'missing' method that shall be implemented: public abstract void scrapeText(byte[] text); // the other methods must take into account to construct the return value correctly public void scrapeTag0(String tagname, Properties tagopts) { } public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { } // string conversions private static String code_iso8859s(int c) { switch ((int) c & 0xff) { // german umlaute and ligaturen case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE"; case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue"; case 0xdf: return "ss"; // accent on letters; i.e. french characters case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A"; case 0xc6: return "AE"; case 0xc7: return "C"; case 0xc8: case 0xc9: case 0xca: return "E"; case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I"; case 0xd0: return "D"; case 0xd1: return "N"; case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O"; case 0xd7: return "x"; case 0xd9: case 0xda: case 0xdb: return "U"; case 0xdd: return "Y"; case 0xde: return "p"; case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a"; case 0xe6: return "ae"; case 0xe7: return "c"; case 0xe8: case 0xe9: case 0xea: return "e"; case 0xec: case 0xed: case 0xee: case 0xef: return "i"; case 0xf0: return "d"; case 0xf1: return "n"; case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o"; case 0xf7: return "%"; case 0xf9: case 0xfa: case 0xfb: return "u"; case 0xfd: case 0xff: return "y"; case 0xfe: return "p"; // special characters case 0xa4: return " euro "; default: return null; } } public static serverByteBuffer convertUmlaute(serverByteBuffer bb) { serverByteBuffer t = new serverByteBuffer(bb.length() + 20); int b0, b1, b2; String z; int i = 0; while (i < bb.length()) { b0 = bb.byteAt(i) & 0xff; // check utf-8 encoding if ((b0 < 128) || (i + 1 == bb.length())) { t.append(b0); i++; } else { b1 = bb.byteAt(i + 1) & 0xff; if (b1 > 0x3f) { z = code_iso8859s(b0); i++; } else if ((b0 > 0xbf) && (b0 < 0xe0)) { z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f)); i += 2; } else { if (i + 2 >= bb.length()) { z = null; i++; } else { b2 = bb.byteAt(i + 2) & 0xff; if (b2 > 0x3f) { z = code_iso8859s(b0); i++; } else { z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f)); i += 3; } } } if (z == null) t.append(b0); else t.append(z); } } return t; } private static byte[] transscript(byte[] code) { String t = (String) trans.get(new String(code)); if (t == null) return new byte[0]; else return t.getBytes(); } protected static serverByteBuffer transscriptAll(serverByteBuffer bb) { int p0 = 0, p1; byte[] t; while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) { p1 = bb.indexOf((byte) ';', p0); if (p1 >= 0) { t = transscript(bb.getBytes(p0, p1 + 1)); bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1)); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1)); } } t = null; return bb; } protected static serverByteBuffer stripAllTags(serverByteBuffer bb) { int p0 = 0, p1; while ((p0 = bb.indexOf(lb, p0)) >= 0) { p1 = bb.indexOf(rb, p0); if (p1 >= 0) { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim()); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim()); } } return bb.trim(); } public static serverByteBuffer stripAll(serverByteBuffer bb) { //return stripAllTags(s); return convertUmlaute(transscriptAll(stripAllTags(bb))); } public void close() { // free resources tags0 = null; tags1 = null; } public void finalize() { close(); } } \ No newline at end of file diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java b/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java index 3d36e4a36..fa8bcdc91 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java @@ -40,8 +40,8 @@ package de.anomic.htmlFilter; -import java.util.TreeSet; import java.util.Properties; +import java.util.TreeSet; public abstract class htmlFilterAbstractTransformer implements htmlFilterTransformer { diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 7fe778d79..a868d955d 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -40,14 +40,14 @@ package de.anomic.htmlFilter; -import java.net.URL; import java.net.MalformedURLException; +import java.net.URL; +import java.text.Collator; import java.util.HashMap; -import java.util.TreeSet; +import java.util.Locale; import java.util.Map; import java.util.Properties; -import java.util.Locale; -import java.text.Collator; +import java.util.TreeSet; import de.anomic.server.serverByteBuffer; diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java index 0d9358131..563dc1170 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java @@ -43,11 +43,11 @@ package de.anomic.htmlFilter; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; -import java.util.TreeSet; +import java.text.Collator; +import java.util.Locale; import java.util.Properties; +import java.util.TreeSet; import java.util.Vector; -import java.util.Locale; -import java.text.Collator; import de.anomic.server.serverByteBuffer; diff --git a/source/de/anomic/http/httpTemplate.java b/source/de/anomic/http/httpTemplate.java index ea118cda2..5594db320 100644 --- a/source/de/anomic/http/httpTemplate.java +++ b/source/de/anomic/http/httpTemplate.java @@ -42,16 +42,16 @@ package de.anomic.http; +import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PushbackInputStream; -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.InputStreamReader; -import java.io.File; import java.util.Hashtable; import de.anomic.server.serverFileUtils; diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java index 90c8ad6ed..e9afc1900 100644 --- a/source/de/anomic/http/httpc.java +++ b/source/de/anomic/http/httpc.java @@ -76,8 +76,6 @@ import java.util.zip.GZIPInputStream; import javax.net.ssl.SSLSocketFactory; -import org.apache.commons.pool.impl.GenericObjectPool; - import de.anomic.server.serverByteBuffer; import de.anomic.server.serverCodings; import de.anomic.server.serverCore; @@ -85,6 +83,7 @@ import de.anomic.server.serverObjects; import de.anomic.server.logging.serverLog; import de.anomic.server.serverCore.Session; +import org.apache.commons.pool.impl.GenericObjectPool; public final class httpc { diff --git a/source/de/anomic/http/httpdHandler.java b/source/de/anomic/http/httpdHandler.java index 5995ec607..968a25d6b 100644 --- a/source/de/anomic/http/httpdHandler.java +++ b/source/de/anomic/http/httpdHandler.java @@ -57,8 +57,6 @@ import java.io.OutputStream; import java.io.PushbackInputStream; import java.util.Properties; -import de.anomic.server.serverSwitch; - public interface httpdHandler { void doGet(Properties conProp, httpHeader header, OutputStream response) throws IOException; diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index c76ff227f..080f56a4d 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -72,7 +72,6 @@ import java.net.ConnectException; import java.net.MalformedURLException; import java.net.NoRouteToHostException; import java.net.Socket; -import java.net.SocketException; import java.net.URL; import java.net.UnknownHostException; import java.util.Date; @@ -637,18 +636,20 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt if (sizeBeforeDelete == -1) { // totally fresh file cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert - cacheManager.stackProcess(cacheEntry, cacheArray); + cacheEntry.cacheArray = cacheArray; + cacheManager.push(cacheEntry); conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_MISS"); } else if (sizeBeforeDelete == cacheArray.length) { // before we came here we deleted a cache entry cacheArray = null; cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD; - cacheManager.stackProcess(cacheEntry); // unnecessary update + cacheManager.push(cacheEntry); // unnecessary update conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REF_FAIL_HIT"); } else { // before we came here we deleted a cache entry cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD; - cacheManager.stackProcess(cacheEntry, cacheArray); // necessary update, write response header to cache + cacheEntry.cacheArray = cacheArray; + cacheManager.push(cacheEntry); // necessary update, write response header to cache conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REFRESH_MISS"); } } else { @@ -661,15 +662,15 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt if (sizeBeforeDelete == -1) { // totally fresh file cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert - cacheManager.stackProcess(cacheEntry); + cacheManager.push(cacheEntry); } else if (sizeBeforeDelete == cacheFile.length()) { // before we came here we deleted a cache entry cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD; - cacheManager.stackProcess(cacheEntry); // unnecessary update + cacheManager.push(cacheEntry); // unnecessary update } else { // before we came here we deleted a cache entry cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD; - cacheManager.stackProcess(cacheEntry); // necessary update, write response header to cache + cacheManager.push(cacheEntry); // necessary update, write response header to cache } // beware! all these writings will not fill the cacheEntry.cacheArray // that means they are not available for the indexer (except they are scraped before) @@ -682,11 +683,11 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt if (sizeBeforeDelete == -1) { // no old file and no load. just data passing cacheEntry.status = plasmaHTCache.CACHE_PASSING; - cacheManager.stackProcess(cacheEntry); + cacheManager.push(cacheEntry); } else { // before we came here we deleted a cache entry cacheEntry.status = plasmaHTCache.CACHE_STALE_NO_RELOAD; - cacheManager.stackProcess(cacheEntry); + cacheManager.push(cacheEntry); } } diff --git a/source/de/anomic/kelondro/kelondroMScoreCluster.java b/source/de/anomic/kelondro/kelondroMScoreCluster.java index f2b1adb3f..8390edbd9 100644 --- a/source/de/anomic/kelondro/kelondroMScoreCluster.java +++ b/source/de/anomic/kelondro/kelondroMScoreCluster.java @@ -43,8 +43,8 @@ package de.anomic.kelondro; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Iterator; -import java.util.TreeMap; import java.util.Map; +import java.util.TreeMap; public class kelondroMScoreCluster { diff --git a/source/de/anomic/kelondro/kelondroMap.java b/source/de/anomic/kelondro/kelondroMap.java index 688994c8b..c8977f5e1 100644 --- a/source/de/anomic/kelondro/kelondroMap.java +++ b/source/de/anomic/kelondro/kelondroMap.java @@ -45,8 +45,6 @@ package de.anomic.kelondro; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; -import java.util.LinkedList; -import java.util.ListIterator; import java.util.Map; public class kelondroMap { diff --git a/source/de/anomic/kelondro/kelondroMergeIterator.java b/source/de/anomic/kelondro/kelondroMergeIterator.java index 0b9f91c57..c2abb8f90 100644 --- a/source/de/anomic/kelondro/kelondroMergeIterator.java +++ b/source/de/anomic/kelondro/kelondroMergeIterator.java @@ -41,8 +41,8 @@ package de.anomic.kelondro; -import java.util.Iterator; import java.util.Comparator; +import java.util.Iterator; import java.util.Set; public class kelondroMergeIterator implements Iterator { diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java index e2db67863..6475e8d4b 100644 --- a/source/de/anomic/plasma/parser/bzip/bzipParser.java +++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java @@ -56,7 +56,6 @@ import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.AbstractParser; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; - public class bzipParser extends AbstractParser implements Parser { /** diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 2fdc05f82..0660636d0 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -63,8 +63,8 @@ import java.util.Properties; import de.anomic.http.httpc; import de.anomic.kelondro.kelondroTree; import de.anomic.server.serverCodings; -import de.anomic.server.logging.serverLog; import de.anomic.server.serverObjects; +import de.anomic.server.logging.serverLog; import de.anomic.tools.crypt; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index 3c5f9a425..a531eae67 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -45,14 +45,12 @@ import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; -import java.util.HashSet; -import java.util.Hashtable; - -import org.apache.commons.pool.impl.GenericObjectPool; import de.anomic.server.serverSemaphore; import de.anomic.server.logging.serverLog; +import org.apache.commons.pool.impl.GenericObjectPool; + public final class plasmaCrawlLoader extends Thread { private final plasmaHTCache cacheManager; diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index d5423836b..740e2b6c0 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -47,7 +47,6 @@ import java.io.File; import java.io.IOException; import java.net.URL; import java.util.Date; -import java.util.Enumeration; import java.util.HashSet; import java.util.Iterator; diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java index 07f29ae9d..30309afac 100644 --- a/source/de/anomic/plasma/plasmaCrawlProfile.java +++ b/source/de/anomic/plasma/plasmaCrawlProfile.java @@ -49,8 +49,8 @@ import java.util.Iterator; import java.util.Map; import de.anomic.kelondro.kelondroDyn; -import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroException; +import de.anomic.kelondro.kelondroMap; import de.anomic.server.serverCodings; public class plasmaCrawlProfile { diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index f2aa30f3f..914999d83 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -53,7 +53,6 @@ import java.util.logging.Logger; import de.anomic.http.httpHeader; import de.anomic.http.httpc; -import de.anomic.http.httpd; import de.anomic.http.httpdProxyHandler; import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverMiniLogFormatter; @@ -331,13 +330,7 @@ public final class plasmaCrawlWorker extends Thread { } // enQueue new entry with response header if (profile != null) { - if ((initiator == null) || (initiator.length() == 0)) { - // enqueued for proxy writings - cacheManager.stackProcess(htCache); - } else { - // direct processing for crawling - cacheManager.process(htCache); - } + cacheManager.push(htCache); } } catch (SocketException e) { // this may happen if the client suddenly closes its connection diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index eafb4df75..2e47a39ed 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -52,13 +52,12 @@ package de.anomic.plasma; import java.io.File; import java.io.FileNotFoundException; -import java.io.FileOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.Date; -import java.util.Map; import java.util.LinkedList; +import java.util.Map; import java.util.TreeMap; import de.anomic.htmlFilter.htmlFilterContentScraper; @@ -76,9 +75,8 @@ public final class plasmaHTCache { private static final int stackLimit = 150; // if we exceed that limit, we do not check idle private static final long idleDelay = 2000; // 2 seconds no hits until we think that we idle - private static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day + public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day - private final plasmaSwitchboard switchboard; private kelondroMap responseHeaderDB = null; private final LinkedList cacheStack; private final TreeMap cacheAge; // a - relation @@ -96,20 +94,21 @@ public final class plasmaHTCache { public static final int CACHE_STALE_RELOAD_BAD = 5; // this updates only the responseHeader, not the content public static final int CACHE_PASSING = 6; // does not touch cache, just passing - public plasmaHTCache(plasmaSwitchboard switchboard, int bufferkb) { - this.switchboard = switchboard; + public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb) { + //this.switchboard = switchboard; this.log = new serverLog("HTCACHE"); + this.cachePath = htCachePath; + this.maxCacheSize = maxCacheSize; // set cache path - cachePath = new File(switchboard.getRootPath(),switchboard.getConfig("proxyCache","HTCACHE")); - if (!(cachePath.exists())) { + if (!(htCachePath.exists())) { // make the cache path - cachePath.mkdir(); + htCachePath.mkdir(); } - if (!(cachePath.isDirectory())) { + if (!(htCachePath.isDirectory())) { // if the cache does not exists or is a file and not a directory, panic - System.out.println("the cache path " + cachePath.toString() + " is not a directory or does not exists and cannot be created"); + System.out.println("the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created"); System.exit(0); } @@ -134,13 +133,87 @@ public final class plasmaHTCache { // init cache age and size management cacheAge = new TreeMap(); currCacheSize = 0; - maxCacheSize = 1024 * 1024 * Long.parseLong(switchboard.getConfig("proxyCacheSize", "2")); // this is megabyte + this.maxCacheSize = maxCacheSize; // start the cache startup thread // this will collect information about the current cache size and elements serverInstantThread.oneTimeJob(this, "cacheScan", log, 5000); } + public int size() { + return cacheStack.size(); + } + + public void push(Entry entry) { + cacheStack.add(entry); + } + + public Entry pop() { + return (Entry) cacheStack.removeFirst(); + } + + public void storeHeader(String urlHash, httpHeader responseHeader) throws IOException { + responseHeaderDB.set(urlHash, responseHeader); + } + + public boolean deleteFile(URL url) { + File file = getCachePath(url); + if (file.exists()) { + currCacheSize -= file.length(); + return file.delete(); + } else { + return false; + } + } + + public boolean writeFile(URL url, byte[] array) { + if (array == null) return false; + try { + File file = getCachePath(url); + if (file.exists()) { + currCacheSize -= file.length(); + file.delete(); + } + file.getParentFile().mkdirs(); + serverFileUtils.write(array, file); + currCacheSize += file.length(); + cacheAge.put(ageString(file.lastModified(), file), file); + } catch (FileNotFoundException e) { + // this is the case of a "(Not a directory)" error, which should be prohibited + // by the shallStoreCache() property. However, sometimes the error still occurs + // In this case do nothing. + log.logError("File storage failed (not a directory): " + e.getMessage()); + return false; + } catch (IOException e) { + log.logError("File storage failed (IO error): " + e.getMessage()); + return false; + } + cleanup(); + return true; + } + + private void cleanup() { + // clean up cache to have enough space for next entries + File f; + while (currCacheSize > maxCacheSize) { + f = (File) cacheAge.remove(cacheAge.firstKey()); + if (f.exists()) { + currCacheSize -= f.length(); + if (f.delete()) { + log.logInfo("DELETED OLD CACHE : " + f.toString()); + f = f.getParentFile(); + if ((f.exists()) && (f.isDirectory())) { + // check size of directory + if (f.list().length == 0) { + // the directory has no files in it; delete it also + if (f.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + f.toString()); + } + } + } + } + } + } + public void close() throws IOException { responseHeaderDB.close(); } @@ -172,8 +245,13 @@ public final class plasmaHTCache { cacheAge.put(ageString(d, f), f); } //System.out.println("%" + (String) cacheAge.firstKey() + "=" + cacheAge.get(cacheAge.firstKey())); - long ageHours = (System.currentTimeMillis() - - Long.parseLong(((String) cacheAge.firstKey()).substring(0, 16), 16)) / 3600000; + long ageHours = 0; + try { + ageHours = (System.currentTimeMillis() - + Long.parseLong(((String) cacheAge.firstKey()).substring(0, 16), 16)) / 3600000; + } catch (NumberFormatException e) { + e.printStackTrace(); + } log.logSystem("CACHE SCANNED, CONTAINS " + c + " FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " + ((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) + @@ -224,145 +302,7 @@ public final class plasmaHTCache { public boolean empty() { return (cacheStack.size() == 0); } - - synchronized public void stackProcess(Entry entry) throws IOException { - lastAcc = System.currentTimeMillis(); - if (full()) - process(entry); - else - cacheStack.add(entry); - } - - synchronized public void stackProcess(Entry entry, byte[] cacheArray) throws IOException { - lastAcc = System.currentTimeMillis(); - entry.cacheArray = cacheArray; - if (full()) - process(entry); - else - cacheStack.add(entry); - } - - public int size() { - return cacheStack.size(); - } - - synchronized public void process(Entry entry) throws IOException { - - if (entry == null) return; - - // store response header - if ((entry.status == CACHE_FILL) || - (entry.status == CACHE_STALE_RELOAD_GOOD) || - (entry.status == CACHE_STALE_RELOAD_BAD)) { - responseHeaderDB.set(entry.nomalizedURLHash, entry.responseHeader); - } - - // work off unwritten files and undone parsing - String storeError = null; - if (((entry.status == CACHE_FILL) || (entry.status == CACHE_STALE_RELOAD_GOOD)) && - ((storeError = entry.shallStoreCache()) == null)) { - - // write file if not written yet - if (entry.cacheArray != null) try { - if (entry.cacheFile.exists()) { - currCacheSize -= entry.cacheFile.length(); - entry.cacheFile.delete(); - } - entry.cacheFile.getParentFile().mkdirs(); - log.logInfo("WRITE FILE (" + entry.cacheArray.length + " bytes) " + entry.cacheFile); - serverFileUtils.write(entry.cacheArray, entry.cacheFile); - log.logDebug("AFTER WRITE cacheArray = " + entry.cacheFile + ": " + ((entry.cacheArray == null) ? "empty" : "full")); - //entry.cacheArray = null; - } catch (FileNotFoundException e) { - // this is the case of a "(Not a directory)" error, which should be prohibited - // by the shallStoreCache() property. However, sometimes the error still occurs - // In this case do nothing. - log.logError("File storage failed: " + e.getMessage()); - } - - // update statistics - currCacheSize += entry.cacheFile.length(); - cacheAge.put(ageString(entry.cacheFile.lastModified(), entry.cacheFile), entry.cacheFile); - - // enqueue in switchboard - switchboard.enQueue(entry); - } else if (entry.status == CACHE_PASSING) { - // even if the file should not be stored in the cache, it can be used to be indexed - if (storeError != null) log.logDebug("NOT STORED " + entry.cacheFile + ":" + storeError); - - // enqueue in switchboard - switchboard.enQueue(entry); - } - - // write log - - switch (entry.status) { - case CACHE_UNFILLED: - log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break; - case CACHE_FILL: - log.logInfo("CACHE FILL: " + entry.cacheFile + - ((entry.cacheArray == null) ? "" : " (cacheArray is filled)") + - ((entry.scraper == null) ? "" : " (scraper is filled)")); - break; - case CACHE_HIT: - log.logInfo("CACHE HIT: " + entry.cacheFile); break; - case CACHE_STALE_NO_RELOAD: - log.logInfo("CACHE STALE, NO RELOAD: " + entry.cacheFile); break; - case CACHE_STALE_RELOAD_GOOD: - log.logInfo("CACHE STALE, NECESSARY RELOAD: " + entry.cacheFile); break; - case CACHE_STALE_RELOAD_BAD: - log.logInfo("CACHE STALE, SUPERFLUOUS RELOAD: " + entry.cacheFile); break; - case CACHE_PASSING: - log.logInfo("PASSING: " + entry.cacheFile); break; - default: - log.logInfo("CACHE STATE UNKNOWN: " + entry.cacheFile); break; - } - } - - public boolean job() { - if (empty()) return false; - try { - File f; - int workoff; - workoff = 1 + cacheStack.size() / 10; - // we want to work off always 10 % to prevent that we collaps - while ((workoff-- > 0) && (!(empty()))) { - process((Entry) cacheStack.removeFirst()); - } - - // loop until we are not idle or nothing more to do - while ((!empty()) && (idle())) { - // work off stack and store entries to file system - process((Entry) cacheStack.removeFirst()); - - // clean up cache to have enough space for next entries - while (currCacheSize > maxCacheSize) { - f = (File) cacheAge.remove(cacheAge.firstKey()); - if (f.exists()) { - currCacheSize -= f.length(); - if (f.delete()) { - log.logInfo("DELETED OLD CACHE : " + f.toString()); - f = f.getParentFile(); - if ((f.exists()) && (f.isDirectory())) { - // check size of directory - if (f.list().length == 0) { - // the directory has no files in it; delete it also - if (f.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + f.toString()); - } - } - } - } - } - } - } catch (IOException e) { - System.out.println("The proxy cache manager has died because of an IO-problem: " + e.getMessage()); - e.printStackTrace(System.out); - System.exit(-1); - } - return true; - } - public static boolean isPicture(httpHeader response) { Object ct = response.get(httpHeader.CONTENT_TYPE); if (ct == null) return false; @@ -803,184 +743,6 @@ public final class plasmaHTCache { return true; } - - public String shallIndexCacheForProxy() { - // decide upon header information if a specific file should be indexed - // this method returns null if the answer is 'YES'! - // if the answer is 'NO' (do not index), it returns a string with the reason - // to reject the crawling demand in clear text - - // check profile - if (!(profile.localIndexing())) return "Indexing_Not_Allowed"; - - // -CGI access in request - // CGI access makes the page very individual, and therefore not usable in caches - if ((isPOST(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)"; - if ((isCGI(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)"; - - // -authorization cases in request - // we checked that in shallStoreCache - - // -ranges in request - // we checked that in shallStoreCache - - // a picture cannot be indexed - if (isPicture(responseHeader)) return "Media_Content_(Picture)"; - if (!(isText(responseHeader))) return "Media_Content_(not_text)"; - if (noIndexingURL(nomalizedURLString)) return "Media_Content_(forbidden)"; - - - // -if-modified-since in request - // if the page is fresh at the very moment we can index it - if ((requestHeader != null) && - (requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) && - (responseHeader.containsKey(httpHeader.LAST_MODIFIED))) { - // parse date - Date d1, d2; - d2 = responseHeader.lastModified(); if (d2 == null) d2 = new Date(); - d1 = requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date(); - // finally, we shall treat the cache as stale if the modification time is after the if-.. time - if (d2.after(d1)) { - //System.out.println("***not indexed because if-modified-since"); - return "Stale_(Last-Modified>Modified-Since)"; - } - } - - // -cookies in request - // unfortunately, we cannot index pages which have been requested with a cookie - // because the returned content may be special for the client - if ((requestHeader != null) && (requestHeader.containsKey(httpHeader.COOKIE))) { - //System.out.println("***not indexed because cookie"); - return "Dynamic_(Requested_With_Cookie)"; - } - - // -set-cookie in response - // the set-cookie from the server does not indicate that the content is special - // thus we do not care about it here for indexing - - // -pragma in cached response - if ((responseHeader.containsKey(httpHeader.PRAGMA)) && - (((String) responseHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE"))) return "Denied_(pragma_no_cache)"; - - // see for documentation also: - // http://www.web-caching.com/cacheability.html - - // calculate often needed values for freshness attributes - Date date = responseHeader.date(); - Date expires = responseHeader.expires(); - Date lastModified = responseHeader.lastModified(); - String cacheControl = (String) responseHeader.get(httpHeader.CACHE_CONTROL); - - // look for freshnes information - - // -expires in cached response - // the expires value gives us a very easy hint when the cache is stale - // sometimes, the expires date is set to the past to prevent that a page is cached - // we use that information to see if we should index it - if (expires != null) { - Date yesterday = new Date((new Date()).getTime() - oneday); - if (expires.before(yesterday)) return "Stale_(Expired)"; - } - - // -lastModified in cached response - // this information is too weak to use it to prevent indexing - // even if we can apply a TTL heuristic for cache usage - - // -cache-control in cached response - // the cache-control has many value options. - if (cacheControl != null) { - cacheControl = cacheControl.trim().toUpperCase(); - /* we have the following cases for cache-control: - "public" -- can be indexed - "private", "no-cache", "no-store" -- cannot be indexed - "max-age=" -- stale/fresh dependent on date - */ - if (cacheControl.startsWith("PUBLIC")) { - // ok, do nothing - } else if ((cacheControl.startsWith("PRIVATE")) || - (cacheControl.startsWith("NO-CACHE")) || - (cacheControl.startsWith("NO-STORE"))) { - // easy case - return "Stale_(denied_by_cache-control=" + cacheControl+ ")"; - } else if (cacheControl.startsWith("MAX-AGE=")) { - // we need also the load date - if (date == null) return "Stale_(no_date_given_in_response)"; - try { - long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live - if ((new Date()).getTime() - date.getTime() > ttl) { - //System.out.println("***not indexed because cache-control"); - return "Stale_(expired_by_cache-control)"; - } - } catch (Exception e) { - return "Error_(" + e.getMessage() + ")"; - } - } - } - - return null; - } - - - public String shallIndexCacheForCrawler() { - // decide upon header information if a specific file should be indexed - // this method returns null if the answer is 'YES'! - // if the answer is 'NO' (do not index), it returns a string with the reason - // to reject the crawling demand in clear text - - // check profile - if (!(profile.localIndexing())) return "Indexing_Not_Allowed"; - - // -CGI access in request - // CGI access makes the page very individual, and therefore not usable in caches - if ((isPOST(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)"; - if ((isCGI(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)"; - - // -authorization cases in request - // we checked that in shallStoreCache - - // -ranges in request - // we checked that in shallStoreCache - - // a picture cannot be indexed - if (isPicture(responseHeader)) return "Media_Content_(Picture)"; - if (!(isText(responseHeader))) return "Media_Content_(not_text)"; - if (noIndexingURL(nomalizedURLString)) return "Media_Content_(forbidden)"; - - // -if-modified-since in request - // if the page is fresh at the very moment we can index it - // -> this does not apply for the crawler - - // -cookies in request - // unfortunately, we cannot index pages which have been requested with a cookie - // because the returned content may be special for the client - // -> this does not apply for a crawler - - // -set-cookie in response - // the set-cookie from the server does not indicate that the content is special - // thus we do not care about it here for indexing - // -> this does not apply for a crawler - - // -pragma in cached response - // -> in the crawler we ignore this - - // look for freshnes information - - // -expires in cached response - // the expires value gives us a very easy hint when the cache is stale - // sometimes, the expires date is set to the past to prevent that a page is cached - // we use that information to see if we should index it - // -> this does not apply for a crawler - - // -lastModified in cached response - // this information is too weak to use it to prevent indexing - // even if we can apply a TTL heuristic for cache usage - - // -cache-control in cached response - // the cache-control has many value options. - // -> in the crawler we ignore this - - return null; - } } diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 2ad248f2a..b6dadbf5f 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -68,17 +68,16 @@ import java.util.Map; import java.util.Properties; import java.util.Set; -import org.apache.commons.pool.KeyedPoolableObjectFactory; -import org.apache.commons.pool.impl.GenericKeyedObjectPool; -import org.apache.commons.pool.impl.GenericObjectPool; - import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterOutputStream; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; import de.anomic.server.serverFileUtils; import de.anomic.server.logging.serverLog; -import de.anomic.yacy.yacySeedUploader; + +import org.apache.commons.pool.impl.GenericKeyedObjectPool; +import org.apache.commons.pool.KeyedPoolableObjectFactory; +import org.apache.commons.pool.impl.GenericObjectPool; public final class plasmaParser { @@ -264,15 +263,22 @@ public final class plasmaParser { } } - public static boolean supportedFileExtContains(String mediaExt) { + public static boolean supportedFileExt(URL url) { + String name = url.getFile(); + int p = name.lastIndexOf('.'); + if (p < 0) return true; // seams to be strange, but this is a directory entry or default file (html) + return supportedFileExtContains(name.substring(p + 1)); + } + + public static boolean supportedFileExtContains(String fileExt) { if (supportedFileExt == null) return false; synchronized(supportedFileExt) { - if (supportedFileExt.contains(mediaExt)) return true; + if (supportedFileExt.contains(fileExt)) return true; } synchronized (supportedRealtimeFileExt) { - return supportedRealtimeFileExt.contains(mediaExt); + return supportedRealtimeFileExt.contains(fileExt); } } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 97fbe0a98..8f410c37d 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -42,15 +42,17 @@ package de.anomic.plasma; -import java.util.*; -import java.io.File; import java.io.IOException; import java.net.URL; -import de.anomic.htmlFilter.htmlFilterContentScraper; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import de.anomic.http.httpHeader; import de.anomic.kelondro.kelondroMScoreCluster; -import de.anomic.server.serverFileUtils; import de.anomic.server.logging.serverLog; -import de.anomic.http.httpHeader; import de.anomic.yacy.yacySearch; public class plasmaSnippetCache { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index d44ddcea1..18a403fda 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -114,7 +114,6 @@ import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; -import java.util.LinkedList; import java.util.Map; import java.util.Set; import java.util.TreeSet; @@ -130,7 +129,6 @@ import de.anomic.kelondro.kelondroTables; import de.anomic.server.serverAbstractSwitch; import de.anomic.server.serverCodings; import de.anomic.server.serverCore; -import de.anomic.server.serverDate; import de.anomic.server.serverInstantThread; import de.anomic.server.serverObjects; import de.anomic.server.serverSemaphore; @@ -149,6 +147,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // load slots public static int crawlSlots = 10; + public static int indexingSlots = 100; // couloured list management public static TreeSet blueList = null; @@ -164,7 +163,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public plasmaHTCache cacheManager; public plasmaSnippetCache snippetCache; public plasmaCrawlLoader cacheLoader; - public LinkedList queueStack = new LinkedList(); + public plasmaSwitchboardQueue sbQueue; public messageBoard messageDB; public wikiBoard wikiDB; public String remoteProxyHost; @@ -256,11 +255,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // start a cache manager log.logSystem("Starting HT Cache Manager"); - this.cacheManager = new plasmaHTCache(this, ramHTTP); + File htCachePath = new File(getRootPath(), getConfig("proxyCache","HTCACHE")); + long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig("proxyCacheSize", "2")); // this is megabyte + this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP); // make parser log.logSystem("Starting Parser"); - this.parser = new plasmaParser(); + this.parser = new plasmaParser(); + + // initialize switchboard queue + sbQueue = new plasmaSwitchboardQueue(this.cacheManager, urlPool.loadedURL, new File(plasmaPath, "switchboardQueue0.stack"), 10, profiles); // define an extension-blacklist log.logSystem("Parser: Initializing Extension Mappings for Media/Parser"); @@ -347,7 +351,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser new serverInstantThread(this, "deQueue", "queueSize"), 10000 + (i * 1000)); } deployThread("70_cachemanager", "Proxy Cache Enqueue", "job takes new proxy files from RAM stack, stores them, and hands over to the Indexing Stack", - new serverInstantThread(cacheManager, "job", "size"), 10000); + new serverInstantThread(this, "htEntryStoreJob", "htEntrySize"), 10000); deployThread("62_remotetriggeredcrawl", "Remote Crawl Job", "thread that performes a single crawl/indexing step triggered by a remote peer", new serverInstantThread(this, "remoteTriggeredCrawlJob", "remoteTriggeredCrawlJobSize"), 30000); deployThread("61_globalcrawltrigger", "Global Crawl Trigger", "thread that triggeres remote peers for crawling", @@ -423,7 +427,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } catch (IOException e) {} } private void cleanProfiles() { - if ((queueStack.size() > 0) || (cacheLoader.size() > 0) || (urlPool.noticeURL.stackSize() > 0)) return; + if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (urlPool.noticeURL.stackSize() > 0)) return; Iterator i = profiles.profiles(true); plasmaCrawlProfile.entry entry; try { @@ -440,6 +444,100 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return cacheManager; } + + + synchronized public void htEntryStoreEnqueued(plasmaHTCache.Entry entry) throws IOException { + if (cacheManager.full()) + htEntryStoreProcess(entry); + else + cacheManager.push(entry); + } + + + + synchronized public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) throws IOException { + + if (entry == null) return false; + + // store response header + if ((entry.status == plasmaHTCache.CACHE_FILL) || + (entry.status == plasmaHTCache.CACHE_STALE_RELOAD_GOOD) || + (entry.status == plasmaHTCache.CACHE_STALE_RELOAD_BAD)) { + cacheManager.storeHeader(entry.nomalizedURLHash, entry.responseHeader); + } + + // work off unwritten files and undone parsing + String storeError = null; + if (((entry.status == plasmaHTCache.CACHE_FILL) || (entry.status == plasmaHTCache.CACHE_STALE_RELOAD_GOOD)) && + ((storeError = entry.shallStoreCache()) == null)) { + + // write file if not written yet + if (entry.cacheArray != null) { + cacheManager.writeFile(entry.url, entry.cacheArray); + log.logInfo("WRITE FILE (" + entry.cacheArray.length + " bytes) " + entry.cacheFile); + } + // enqueue for further crawling + enQueue(sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()), + entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE), + entry.initiator(), entry.depth, entry.profile.handle(), + (entry.scraper == null) ? 0 : entry.scraper.getAnchors().size(), + (entry.scraper == null) ? 0 : entry.scraper.getImages().size(), + (entry.scraper == null) ? "" : entry.scraper.getHeadline() + )); + } else if (entry.status == plasmaHTCache.CACHE_PASSING) { + // even if the file should not be stored in the cache, it can be used to be indexed + if (storeError != null) log.logDebug("NOT STORED " + entry.cacheFile + ":" + storeError); + + // enqueue for further crawling + enQueue(sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()), + entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE), + entry.initiator(), entry.depth, entry.profile.handle(), + (entry.scraper == null) ? 0 : entry.scraper.getAnchors().size(), + (entry.scraper == null) ? 0 : entry.scraper.getImages().size(), + (entry.scraper == null) ? "" : entry.scraper.getHeadline() + )); + } + + // write log + + switch (entry.status) { + case plasmaHTCache.CACHE_UNFILLED: + log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break; + case plasmaHTCache.CACHE_FILL: + log.logInfo("CACHE FILL: " + entry.cacheFile + + ((entry.cacheArray == null) ? "" : " (cacheArray is filled)") + + ((entry.scraper == null) ? "" : " (scraper is filled)")); + break; + case plasmaHTCache.CACHE_HIT: + log.logInfo("CACHE HIT: " + entry.cacheFile); break; + case plasmaHTCache.CACHE_STALE_NO_RELOAD: + log.logInfo("CACHE STALE, NO RELOAD: " + entry.cacheFile); break; + case plasmaHTCache.CACHE_STALE_RELOAD_GOOD: + log.logInfo("CACHE STALE, NECESSARY RELOAD: " + entry.cacheFile); break; + case plasmaHTCache.CACHE_STALE_RELOAD_BAD: + log.logInfo("CACHE STALE, SUPERFLUOUS RELOAD: " + entry.cacheFile); break; + case plasmaHTCache.CACHE_PASSING: + log.logInfo("PASSING: " + entry.cacheFile); break; + default: + log.logInfo("CACHE STATE UNKNOWN: " + entry.cacheFile); break; + } + return true; + } + + + public boolean htEntryStoreJob() { + if (cacheManager.empty()) return false; + try { + return htEntryStoreProcess(cacheManager.pop()); + } catch (IOException e) { + return false; + } + } + + public int htEntrySize() { + return cacheManager.size(); + } + private static TreeSet loadList(File file) { TreeSet list = new TreeSet(kelondroMSetTools.fastStringComparator); if (!(file.exists())) return list; @@ -487,7 +585,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser */ public int queueSize() { - return queueStack.size(); + return sbQueue.size(); //return processStack.size() + cacheLoader.size() + noticeURL.stackSize(); } @@ -502,16 +600,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public void enQueue(Object job) { - plasmaHTCache.Entry entry = (plasmaHTCache.Entry) job; - queueStack.addLast(entry); + if (!(job instanceof plasmaSwitchboardQueue.Entry)) { + System.out.println("internal error at plasmaSwitchboard.enQueue: wrong job type"); + System.exit(0); + } + try { + sbQueue.push((plasmaSwitchboardQueue.Entry) job); + } catch (IOException e) { + log.logError("IOError in plasmaSwitchboard.enQueue: " + e.getMessage()); + e.printStackTrace(); + } } public boolean deQueue() { // work off fresh entries from the proxy or from the crawler - plasmaHTCache.Entry nextentry; - synchronized (queueStack) { - if (queueStack.size() == 0) { + plasmaSwitchboardQueue.Entry nextentry; + synchronized (sbQueue) { + if (sbQueue.size() == 0) { //log.logDebug("DEQUEUE: queue is empty"); return false; // nothing to do } @@ -521,12 +627,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // do one processing step log.logDebug("DEQUEUE: cacheManager=" + ((cacheManager.idle()) ? "idle" : "busy") + - ", queueStack=" + queueStack.size() + + ", sbQueueSize=" + sbQueue.size() + ", coreStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", limitStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", overhangStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", remoteStackSize=" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE)); - nextentry = (plasmaHTCache.Entry) queueStack.removeFirst(); + try { + nextentry = sbQueue.pop(); + } catch (IOException e) { + log.logError("IOError in plasmaSwitchboard.deQueue: " + e.getMessage()); + e.printStackTrace(); + return false; + } } processResourceStack(nextentry); return true; @@ -601,13 +713,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser //log.logDebug("CoreCrawl: queue is empty"); return false; } - if (queueStack.size() >= crawlSlots) { - log.logDebug("CoreCrawl: too many processes in queue, dismissed (" + - "queueStack=" + queueStack.size() + ")"); + if (sbQueue.size() >= indexingSlots) { + log.logDebug("CoreCrawl: too many processes in indexing queue, dismissed (" + + "sbQueueSize=" + sbQueue.size() + ")"); return false; } if (cacheLoader.size() >= crawlSlots) { - log.logDebug("CoreCrawl: too many loader in queue, dismissed (" + + log.logDebug("CoreCrawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + cacheLoader.size() + ")"); return false; } @@ -688,7 +800,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false"))); boolean tryRemote = - ((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (queueStack.size() != 0)) /* should do ourself */ && + ((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sbQueue.size() != 0)) /* should do ourself */ && (profile.remoteIndexing()) /* granted */ && (urlEntry.initiator() != null) && (!(urlEntry.initiator().equals(plasmaURL.dummyHash))) /* not proxy */ && ((yacyCore.seedDB.mySeed.isSenior()) || @@ -700,9 +812,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // alternatively do a local crawl - if (queueStack.size() >= crawlSlots) { + if (sbQueue.size() >= crawlSlots) { log.logDebug("LimitCrawl: too many processes in queue, dismissed (" + - "queueStack=" + queueStack.size() + ")"); + "sbQueueSize=" + sbQueue.size() + ")"); return false; } if (cacheLoader.size() >= crawlSlots) { @@ -776,8 +888,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser processLocalCrawling(urlEntry, profile, stats); return true; } - - private void processResourceStack(plasmaHTCache.Entry entry) { + + private void processResourceStack(plasmaSwitchboardQueue.Entry entry) { // work off one stack entry with a fresh resource (scraped web page) try { // we must distinguish the following cases: resource-load was initiated by @@ -802,39 +914,43 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser processCase = 6; } - log.logDebug("processResourceStack processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", source=" + ((entry.cacheArray == null) ? "scraper" : "byte[]") + ", url=" + entry.nomalizedURLString); // DEBUG + log.logDebug("processResourceStack processCase=" + processCase + + ", depth=" + entry.depth() + + ", maxDepth=" + ((entry.profile() == null) ? "null" : "" + entry.profile().generalDepth()) + + ", filter=" + ((entry.profile() == null) ? "null" : "" + entry.profile().generalFilter()) + + ", initiatorHash=" + initiatorHash + + ", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) + + ", url=" + entry.url()); // DEBUG // parse content plasmaParserDocument document = null; - if (plasmaParser.supportedMimeTypesContains(entry.responseHeader.mime())) { - if (entry.scraper != null) { - log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is pre-parsed by scraper"); - document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper); - } else if (entry.cacheArray != null) { - log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed yet, parsing now from cacheArray"); - document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheArray); + if ((plasmaParser.supportedFileExt(entry.url())) || + ((entry.responseHeader() != null) && + (plasmaParser.supportedMimeTypesContains(entry.responseHeader().mime())))) { + if (entry.cacheFile().exists()) { + log.logDebug("(Parser) '" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File"); + document = parser.parseSource(entry.url(), (entry.responseHeader() == null) ? null : entry.responseHeader().mime(), entry.cacheFile()); } else { - if (entry.cacheFile.exists()) { - log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed yet, parsing now from File"); - document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheFile); - } else { - log.logDebug("(Parser) '" + entry.nomalizedURLString + "' cannot be parsed, no resource available"); - return; - } + log.logDebug("(Parser) '" + entry.normalizedURLString() + "' cannot be parsed, no resource available"); + return; } if (document == null) { - log.logError("(Parser) '" + entry.nomalizedURLString + "' parse failure"); + log.logError("(Parser) '" + entry.normalizedURLString() + "' parse failure"); return; } } else { - log.logDebug("(Parser) '" + entry.nomalizedURLString + "'. Unsupported mimeType '" + entry.responseHeader.mime() + "'."); + log.logDebug("(Parser) '" + entry.normalizedURLString() + "'. Unsupported mimeType '" + ((entry.responseHeader() == null) ? null : entry.responseHeader().mime()) + "'."); return; } + Date loadDate = entry.responseHeader().lastModified(); + if (loadDate == null) loadDate = entry.responseHeader().date(); + if (loadDate == null) loadDate = new Date(); + // put anchors on crawl stack if (((processCase == 4) || (processCase == 5)) && - (entry.depth < entry.profile.generalDepth())) { + ((entry.profile() == null) || (entry.depth() < entry.profile().generalDepth()))) { Map hl = document.getHyperlinks(); Iterator i = hl.entrySet().iterator(); String nexturlstring; @@ -844,15 +960,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser while (i.hasNext()) { e = (Map.Entry) i.next(); nexturlstring = (String) e.getKey(); - rejectReason = stackCrawl(nexturlstring, entry.nomalizedURLString, initiatorHash, (String) e.getValue(), entry.lastModified, entry.depth + 1, entry.profile); + + rejectReason = stackCrawl(nexturlstring, entry.normalizedURLString(), initiatorHash, (String) e.getValue(), loadDate, entry.depth() + 1, entry.profile()); if (rejectReason == null) { c++; } else { - urlPool.errorURL.newEntry(new URL(nexturlstring), entry.nomalizedURLString, entry.initiator(), yacyCore.seedDB.mySeed.hash, + urlPool.errorURL.newEntry(new URL(nexturlstring), entry.normalizedURLString(), entry.initiator(), yacyCore.seedDB.mySeed.hash, (String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false); } } - log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() + + log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url().toString() + ", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE)); } @@ -870,51 +987,56 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } if (noIndexReason == null) { // strip out words - log.logDebug("(Profile) Condensing for '" + entry.nomalizedURLString + "'"); + log.logDebug("(Profile) Condensing for '" + entry.normalizedURLString() + "'"); plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText())); //log.logInfo("INDEXING HEADLINE:" + descr); try { - log.logDebug("(Profile) Create LURL-Entry for '" + entry.nomalizedURLString + "'"); + log.logDebug("(Profile) Create LURL-Entry for '" + entry.normalizedURLString() + "', " + + "responseHeader=" + entry.responseHeader().toString()); + Date lastModified = entry.responseHeader().lastModified(); + if (lastModified == null) lastModified = entry.responseHeader().date(); + if (lastModified == null) lastModified = new Date(); plasmaCrawlLURL.entry newEntry = urlPool.loadedURL.newEntry( - entry.url, descr, entry.lastModified, new Date(), + entry.url(), descr, lastModified, new Date(), initiatorHash, yacyCore.seedDB.mySeed.hash, referrerHash, 0, true, Integer.parseInt(condenser.getAnalysis().getProperty("INFORMATION_VALUE","0"), 16), - entry.language, entry.doctype, + plasmaWordIndexEntry.language(entry.url()), + plasmaWordIndexEntry.docType(entry.responseHeader().mime()), entry.size(), (int) Long.parseLong(condenser.getAnalysis().getProperty("NUMB_WORDS","0"), 16), processCase ); String urlHash = newEntry.hash(); - log.logDebug("(Profile) Remove NURL for '" + entry.nomalizedURLString + "'"); + log.logDebug("(Profile) Remove NURL for '" + entry.normalizedURLString() + "'"); urlPool.noticeURL.remove(urlHash); // worked-off if (((processCase == 4) || (processCase == 5) || (processCase == 6)) && - (entry.profile.localIndexing())) { + (entry.profile().localIndexing())) { // remove stopwords - log.logDebug("(Profile) Exclude Stopwords for '" + entry.nomalizedURLString + "'"); - log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url); + log.logDebug("(Profile) Exclude Stopwords for '" + entry.normalizedURLString() + "'"); + log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url()); //System.out.println("DEBUG: words left to be indexed: " + condenser.getWords()); // do indexing - log.logDebug("(Profile) Create Index for '" + entry.nomalizedURLString + "'"); - int words = searchManager.addPageIndex(entry.url, urlHash, entry.lastModified, condenser, entry.language, entry.doctype); - log.logInfo("Indexed " + words + " words in URL " + entry.url + " (" + descr + ")"); + log.logDebug("(Profile) Create Index for '" + entry.normalizedURLString() + "'"); + int words = searchManager.addPageIndex(entry.url(), urlHash, loadDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(entry.responseHeader().mime())); + log.logInfo("Indexed " + words + " words in URL " + entry.url() + " (" + descr + ")"); // if this was performed for a remote crawl request, notify requester if ((processCase == 6) && (initiator != null)) { - log.logInfo("Sending crawl receipt for '" + entry.nomalizedURLString + "' to " + initiator.getName()); + log.logInfo("Sending crawl receipt for '" + entry.normalizedURLString() + "' to " + initiator.getName()); yacyClient.crawlReceipt(initiator, "crawl", "fill", "indexed", newEntry, ""); } } else { - log.logDebug("Resource '" + entry.nomalizedURLString + "' not indexed (indexing is off)"); + log.logDebug("Resource '" + entry.normalizedURLString() + "' not indexed (indexing is off)"); } } catch (Exception ee) { - log.logError("Could not index URL " + entry.url + ": " + ee.getMessage()); + log.logError("Could not index URL " + entry.url() + ": " + ee.getMessage()); ee.printStackTrace(); if ((processCase == 6) && (initiator != null)) { yacyClient.crawlReceipt(initiator, "crawl", "exception", ee.getMessage(), null, ""); @@ -922,8 +1044,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } else { - log.logInfo("Not indexed any word in URL " + entry.url + "; cause: " + noIndexReason); - urlPool.errorURL.newEntry(entry.url, referrerHash, + log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason); + urlPool.errorURL.newEntry(entry.url(), referrerHash, ((entry.proxy()) ? plasmaURL.dummyHash : entry.initiator()), yacyCore.seedDB.mySeed.hash, descr, noIndexReason, new bitfield(plasmaURL.urlFlagLength), true); @@ -1464,7 +1586,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public String toString() { // it is possible to use this method in the cgi pages. // actually it is used there for testing purpose - return "PROPS: " + super.toString() + "; QUEUE: " + queueStack.toString(); + return "PROPS: " + super.toString() + "; QUEUE: " + sbQueue.toString(); } // method for index deletion @@ -1536,7 +1658,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser long starttime = System.currentTimeMillis(); try { if ( - (queueStack.size() == 0) && + (sbQueue.size() == 0) && (cacheLoader.size() == 0) && (urlPool.noticeURL.stackSize() == 0) && (getConfig("allowDistributeIndex", "false").equals("true")) && diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 191af0a9a..222160d52 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -49,13 +49,12 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; -import java.util.HashSet; import java.util.Iterator; import java.util.TreeSet; import de.anomic.kelondro.kelondroMSetTools; -import de.anomic.yacy.yacySeedDB; import de.anomic.server.logging.serverLog; +import de.anomic.yacy.yacySeedDB; public final class plasmaWordIndex { diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index f24728203..5ada591f6 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -52,10 +52,13 @@ package de.anomic.plasma; -import java.io.*; -import java.util.*; -import java.lang.RuntimeException; -import de.anomic.kelondro.*; +import java.io.File; +import java.io.IOException; +import java.util.Iterator; + +import de.anomic.kelondro.kelondroException; +import de.anomic.kelondro.kelondroRecords; +import de.anomic.kelondro.kelondroTree; import de.anomic.server.logging.serverLog; public final class plasmaWordIndexAssortment { diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java index 1698b7972..adb1fa25c 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java @@ -47,8 +47,10 @@ package de.anomic.plasma; import java.io.File; -import java.util.*; -import de.anomic.kelondro.*; +import java.util.HashSet; +import java.util.Iterator; + +import de.anomic.kelondro.kelondroMergeIterator; import de.anomic.server.logging.serverLog; public final class plasmaWordIndexAssortmentCluster { diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 2a655d241..df8112c9e 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -42,10 +42,17 @@ package de.anomic.plasma; -import java.io.*; -import java.util.*; -import java.lang.RuntimeException; -import de.anomic.kelondro.*; +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; + +import de.anomic.kelondro.kelondroException; +import de.anomic.kelondro.kelondroMScoreCluster; +import de.anomic.kelondro.kelondroMergeIterator; +import de.anomic.kelondro.kelondroRecords; +import de.anomic.kelondro.kelondroStack; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; diff --git a/source/de/anomic/plasma/plasmaWordIndexClassicCacheMigration.java b/source/de/anomic/plasma/plasmaWordIndexClassicCacheMigration.java index e6b50f3e7..fc9545fd2 100644 --- a/source/de/anomic/plasma/plasmaWordIndexClassicCacheMigration.java +++ b/source/de/anomic/plasma/plasmaWordIndexClassicCacheMigration.java @@ -60,11 +60,14 @@ package de.anomic.plasma; -import java.io.*; -import java.util.*; -import de.anomic.server.*; +import java.io.File; +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; + +import de.anomic.kelondro.kelondroException; +import de.anomic.kelondro.kelondroTree; import de.anomic.server.logging.serverLog; -import de.anomic.kelondro.*; public class plasmaWordIndexClassicCacheMigration { diff --git a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java index 6de647743..dad5f2c2f 100644 --- a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java +++ b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java @@ -44,9 +44,12 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.TreeSet; -import de.anomic.kelondro.*; +import de.anomic.kelondro.kelondroMSetTools; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java index 2df9321d1..2fc3e4e89 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java @@ -48,9 +48,9 @@ package de.anomic.plasma; import java.net.URL; import java.util.Properties; +import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.server.serverCodings; import de.anomic.yacy.yacySeedDB; -import de.anomic.htmlFilter.htmlFilterContentScraper; public class plasmaWordIndexEntry { diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java index d4bd09701..a042c0b06 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java @@ -54,6 +54,7 @@ package de.anomic.plasma; import java.util.HashMap; import java.util.Iterator; + import de.anomic.server.serverCodings; public class plasmaWordIndexEntryContainer implements Comparable { diff --git a/source/de/anomic/plasma/plasmaWordIndexInterface.java b/source/de/anomic/plasma/plasmaWordIndexInterface.java index ef583b948..0187bb877 100644 --- a/source/de/anomic/plasma/plasmaWordIndexInterface.java +++ b/source/de/anomic/plasma/plasmaWordIndexInterface.java @@ -42,7 +42,7 @@ package de.anomic.plasma; -import java.util.*; +import java.util.Iterator; public interface plasmaWordIndexInterface { diff --git a/source/de/anomic/server/logging/serverLog.java b/source/de/anomic/server/logging/serverLog.java index ff24e787b..f2eb86916 100644 --- a/source/de/anomic/server/logging/serverLog.java +++ b/source/de/anomic/server/logging/serverLog.java @@ -44,8 +44,6 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; -import java.io.PrintStream; -import java.io.PrintWriter; import java.util.logging.Level; import java.util.logging.LogManager; import java.util.logging.Logger; diff --git a/source/de/anomic/server/logging/serverSimpleLogFormatter.java b/source/de/anomic/server/logging/serverSimpleLogFormatter.java index 3ce03268b..73c2e08ec 100644 --- a/source/de/anomic/server/logging/serverSimpleLogFormatter.java +++ b/source/de/anomic/server/logging/serverSimpleLogFormatter.java @@ -5,15 +5,9 @@ import java.io.StringWriter; import java.text.FieldPosition; import java.text.SimpleDateFormat; import java.util.Date; -import java.util.logging.ConsoleHandler; -import java.util.logging.Handler; -import java.util.logging.Level; import java.util.logging.LogRecord; -import java.util.logging.Logger; import java.util.logging.SimpleFormatter; -import org.apache.commons.collections.map.CaseInsensitiveMap; - public class serverSimpleLogFormatter extends SimpleFormatter { diff --git a/source/de/anomic/server/serverAbstractSwitch.java b/source/de/anomic/server/serverAbstractSwitch.java index 03e563076..2f1922ceb 100644 --- a/source/de/anomic/server/serverAbstractSwitch.java +++ b/source/de/anomic/server/serverAbstractSwitch.java @@ -48,11 +48,10 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; import java.net.InetAddress; -import java.util.Enumeration; -import java.util.Hashtable; -import java.util.Map; import java.util.HashMap; +import java.util.Hashtable; import java.util.Iterator; +import java.util.Map; import java.util.Properties; import java.util.TreeMap; diff --git a/source/de/anomic/server/serverSwitch.java b/source/de/anomic/server/serverSwitch.java index 24eabff19..3d2bf9e29 100644 --- a/source/de/anomic/server/serverSwitch.java +++ b/source/de/anomic/server/serverSwitch.java @@ -47,7 +47,6 @@ package de.anomic.server; import java.net.InetAddress; -import java.util.Enumeration; import java.util.Iterator; import de.anomic.server.logging.serverLog; diff --git a/source/de/anomic/yacy/seedUpload/yacySeedUploadFile.java b/source/de/anomic/yacy/seedUpload/yacySeedUploadFile.java index a128ba26f..57c7bce8c 100644 --- a/source/de/anomic/yacy/seedUpload/yacySeedUploadFile.java +++ b/source/de/anomic/yacy/seedUpload/yacySeedUploadFile.java @@ -1,7 +1,6 @@ package de.anomic.yacy.seedUpload; import java.io.File; -import java.net.URL; import de.anomic.server.serverFileUtils; import de.anomic.server.serverSwitch; diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index b14725fc0..d01783503 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -52,11 +52,11 @@ import java.util.Vector; import de.anomic.http.httpc; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSearch; +import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.plasma.plasmaWordIndexEntryContainer; -import de.anomic.plasma.plasmaSnippetCache; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.tools.crypt; diff --git a/source/de/anomic/yacy/yacyCore.java b/source/de/anomic/yacy/yacyCore.java index d54d4dcb8..939b5bbd5 100644 --- a/source/de/anomic/yacy/yacyCore.java +++ b/source/de/anomic/yacy/yacyCore.java @@ -62,7 +62,6 @@ import java.net.MalformedURLException; import java.net.URI; import java.net.URL; import java.text.SimpleDateFormat; -import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.GregorianCalendar; @@ -70,15 +69,12 @@ import java.util.Hashtable; import java.util.LinkedList; import java.util.List; import java.util.TimeZone; -import java.util.Vector; -import de.anomic.http.httpc; import de.anomic.net.natLib; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.server.logging.serverLog; -import de.anomic.server.serverCore; import de.anomic.server.serverSemaphore; import de.anomic.server.serverSwitch; +import de.anomic.server.logging.serverLog; public class yacyCore { diff --git a/source/de/anomic/yacy/yacyPeerActions.java b/source/de/anomic/yacy/yacyPeerActions.java index c0ec8f926..fb4e9474c 100644 --- a/source/de/anomic/yacy/yacyPeerActions.java +++ b/source/de/anomic/yacy/yacyPeerActions.java @@ -266,6 +266,8 @@ public class yacyPeerActions { } } catch (java.text.ParseException e) { ctime = yacyCore.universalTime(); + } catch (java.lang.NumberFormatException e) { + ctime = yacyCore.universalTime(); } if (Math.abs(yacyCore.universalTime() - ctime) > 3600000) { diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 6c01af540..e4e41f4e9 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -45,9 +45,9 @@ import java.util.Iterator; import java.util.Set; import de.anomic.kelondro.kelondroMScoreCluster; -import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSearch; +import de.anomic.plasma.plasmaSnippetCache; public class yacySearch extends Thread { diff --git a/source/de/anomic/yacy/yacySeedDB.java b/source/de/anomic/yacy/yacySeedDB.java index 16360c72b..0c4b5eb3c 100644 --- a/source/de/anomic/yacy/yacySeedDB.java +++ b/source/de/anomic/yacy/yacySeedDB.java @@ -58,7 +58,6 @@ import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMap; -import de.anomic.net.ftpc; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; import de.anomic.server.serverSwitch; diff --git a/source/yacy.java b/source/yacy.java index f3ddd6605..6dd089b7f 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -64,9 +64,6 @@ */ -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.io.FileInputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.ByteArrayOutputStream; @@ -78,22 +75,23 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.net.URL; +import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; -import java.util.Enumeration; import java.util.Properties; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; +import de.anomic.data.translator; import de.anomic.http.httpHeader; import de.anomic.http.httpc; import de.anomic.http.httpd; import de.anomic.http.httpdFileHandler; import de.anomic.http.httpdProxyHandler; -import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroMScoreCluster; +import de.anomic.kelondro.kelondroTree; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaWordIndex; @@ -107,7 +105,6 @@ import de.anomic.server.serverSystem; import de.anomic.server.logging.serverLog; import de.anomic.tools.enumerateFiles; import de.anomic.yacy.yacyCore; -import de.anomic.data.translator; public final class yacy {