From 5d06ded005e758b716fcf0354f3d52b2a4fe02bf Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 17 Jun 2005 01:26:51 +0000 Subject: [PATCH] enhanced html parser speed git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@290 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Network.java | 5 +- htroot/Status.java | 28 +- .../htmlFilter/htmlFilterAbstractScraper.java | 561 +++++++++--------- .../htmlFilter/htmlFilterContentScraper.java | 18 +- .../htmlFilter/htmlFilterOutputStream.java | 17 +- source/de/anomic/http/httpc.java | 2 +- source/de/anomic/http/httpd.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 64 +- source/de/anomic/server/serverByteBuffer.java | 21 +- source/de/anomic/server/serverDate.java | 24 + 10 files changed, 387 insertions(+), 355 deletions(-) diff --git a/htroot/Network.java b/htroot/Network.java index 1e3f7861b..b95364419 100644 --- a/htroot/Network.java +++ b/htroot/Network.java @@ -48,6 +48,7 @@ import java.util.Enumeration; import de.anomic.http.httpHeader; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; +import de.anomic.server.serverDate; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; @@ -101,7 +102,7 @@ public class Network { accActWords += words; } prop.put("table_my-version", seed.get("Version", "-")); - prop.put("table_my-uptime", Status.intervalToString(seed.get("Uptime", "-"))); + prop.put("table_my-uptime", serverDate.intervalToString(60000 * Long.parseLong(seed.get("Uptime", "")))); prop.put("table_my-links", groupDigits(links)); prop.put("table_my-words", groupDigits(words)); prop.put("table_my-acceptcrawl", "" + (seed.getFlagAcceptRemoteCrawl() ? 1 : 0) ); @@ -210,7 +211,7 @@ public class Network { prop.put("table_list_"+conCount+"_version", seed.get("Version", "-")); prop.put("table_list_"+conCount+"_contact", (seed.getFlagDirectConnect() ? 1 : 0) ); prop.put("table_list_"+conCount+"_lastSeen", lastSeen(seed.get("LastSeen", "-")) ); - prop.put("table_list_"+conCount+"_uptime", Status.intervalToString(seed.get("Uptime", "-"))); + prop.put("table_list_"+conCount+"_uptime", serverDate.intervalToString(60000 * Long.parseLong(seed.get("Uptime", "0")))); prop.put("table_list_"+conCount+"_links", groupDigits(links)); prop.put("table_list_"+conCount+"_words", groupDigits(words)); prop.put("table_list_"+conCount+"_acceptcrawl", (seed.getFlagAcceptRemoteCrawl() ? 1 : 0) ); diff --git a/htroot/Status.java b/htroot/Status.java index c654fd340..51c36ecd0 100644 --- a/htroot/Status.java +++ b/htroot/Status.java @@ -51,6 +51,7 @@ import de.anomic.http.httpdByteCountOutputStream; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; +import de.anomic.server.serverDate; import de.anomic.yacy.yacyCore; public class Status { @@ -112,7 +113,7 @@ public class Status { prop.put("peerStatistics", 0);//unknown } else { prop.put("peerStatistics", 1); - prop.put("peerStatistics_uptime", intervalToString(yacyCore.seedDB.mySeed.get("Uptime", "unknown"))); + prop.put("peerStatistics_uptime", serverDate.intervalToString(60000 * Long.parseLong(yacyCore.seedDB.mySeed.get("Uptime", "0")))); prop.put("peerStatistics_links", yacyCore.seedDB.mySeed.get("LCount", "unknown")); prop.put("peerStatistics_words", yacyCore.seedDB.mySeed.get("ICount", "unknown")); prop.put("peerStatistics_juniorConnects", yacyCore.peerActions.juniorConnects); @@ -195,31 +196,6 @@ public class Status { return prop; } - public static String intervalToString(String minsAsString) - { - try { - long mins = Long.parseLong(minsAsString); - - StringBuffer uptime = new StringBuffer(); - - int uptimeDays = (int) (Math.floor(mins/1440)); - int uptimeHours = (int) (Math.floor(mins/60)%24); - int uptimeMins = (int) mins%60; - - uptime.append(uptimeDays) - .append(((uptimeDays == 1)?" day ":" days ")) - .append((uptimeHours < 10)?"0":"") - .append(uptimeHours) - .append(":") - .append((uptimeMins < 10)?"0":"") - .append(uptimeMins); - - return uptime.toString(); - } catch (Exception e) { - return "unknown"; - } - } - public static String bytesToString(long byteCount) { try { StringBuffer byteString = new StringBuffer(); diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index 0749a6421..301bcab8c 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -27,6 +27,7 @@ package de.anomic.htmlFilter; import java.util.HashSet; +import java.util.HashMap; import java.util.Properties; import de.anomic.server.serverByteBuffer; @@ -40,6 +41,255 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { private HashSet tags0; private HashSet tags1; + // define a translation table for html character codings + private static HashMap trans = new HashMap(300); + static { + trans.put(""", "\""); //Anführungszeichen oben + trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und + trans.put("<", "<"); //öffnende spitze Klammer + trans.put(">", ">"); //schließende spitze Klammer + trans.put(" ", " "); //Erzwungenes Leerzeichen + trans.put("¡", "!"); //umgekehrtes Ausrufezeichen + trans.put("¢", " cent "); //Cent-Zeichen + trans.put("£", " pound "); //Pfund-Zeichen + trans.put("¤", " currency "); //Währungs-Zeichen + trans.put("¥", " yen "); //Yen-Zeichen + trans.put("¦", " "); //durchbrochener Strich + trans.put("§", " paragraph "); //Paragraph-Zeichen + trans.put("¨", " "); //Pünktchen oben + trans.put("©", " copyright "); //Copyright-Zeichen + trans.put("ª", " "); //Ordinal-Zeichen weiblich + trans.put("«", " "); //angewinkelte Anführungszeichen links + trans.put("¬", " not "); //Verneinungs-Zeichen + trans.put("­", "-"); //kurzer Trennstrich + trans.put("®", " trademark "); //Registriermarke-Zeichen + trans.put("¯", " "); //Überstrich + trans.put("°", " degree "); //Grad-Zeichen + trans.put("±", " +/- "); //Plusminus-Zeichen + trans.put("²", " square "); //Hoch-2-Zeichen + trans.put("³", " 3 "); //Hoch-3-Zeichen + trans.put("´", " "); //Acute-Zeichen + trans.put("µ", " micro "); //Mikro-Zeichen + trans.put("¶", " paragraph "); //Absatz-Zeichen + trans.put("·", " "); //Mittelpunkt + trans.put("¸", " "); //Häkchen unten + trans.put("¹", " "); //Hoch-1-Zeichen + trans.put("º", " degree "); //Ordinal-Zeichen männlich + trans.put("»", " "); //angewinkelte Anführungszeichen rechts + trans.put("¼", " quarter "); //ein Viertel + trans.put("½", " half "); //ein Halb + trans.put("¾", " 3/4 "); //drei Viertel + trans.put("¿", "?"); //umgekehrtes Fragezeichen + trans.put("À", "A"); //A mit Accent grave + trans.put("Á", "A"); //A mit Accent acute + trans.put("Â", "A"); //A mit Circumflex + trans.put("Ã", "A"); //A mit Tilde + trans.put("Ä", "Ae"); //A Umlaut + trans.put("Å", "A"); //A mit Ring + trans.put("Æ", "A"); //A mit legiertem E + trans.put("Ç", "C"); //C mit Häkchen + trans.put("È", "E"); //E mit Accent grave + trans.put("É", "E"); //E mit Accent acute + trans.put("Ê", "E"); //E mit Circumflex + trans.put("Ë", "E"); //E Umlaut + trans.put("Ì", "I"); //I mit Accent grave + trans.put("Í", "I"); //I mit Accent acute + trans.put("Î", "I"); //I mit Circumflex + trans.put("Ï", "I"); //I Umlaut + trans.put("Ð", "D"); //Eth (isländisch) + trans.put("Ñ", "N"); //N mit Tilde + trans.put("Ò", "O"); //O mit Accent grave + trans.put("Ó", "O"); //O mit Accent acute + trans.put("Ô", "O"); //O mit Circumflex + trans.put("Õ", "O"); //O mit Tilde + trans.put("Ö", "Oe"); //O Umlaut + trans.put("×", " times "); //Mal-Zeichen + trans.put("Ø", "O"); //O mit Schrägstrich + trans.put("Ù", "U"); //U mit Accent grave + trans.put("Ú", "U"); //U mit Accent acute + trans.put("Û", "U"); //U mit Circumflex + trans.put("Ü", "Ue"); //U Umlaut + trans.put("Ý", "Y"); //Y mit Accent acute + trans.put("Þ", "P"); //THORN (isländisch) + trans.put("ß", "ss"); //scharfes S + trans.put("à", "a"); //a mit Accent grave + trans.put("á", "a"); //a mit Accent acute + trans.put("â", "a"); //a mit Circumflex + trans.put("ã", "a"); //a mit Tilde + trans.put("ä", "ae"); //a Umlaut + trans.put("å", "a"); //a mit Ring + trans.put("æ", "a"); //a mit legiertem e + trans.put("ç", "c"); //c mit Häkchen + trans.put("è", "e"); //e mit Accent grave + trans.put("é", "e"); //e mit Accent acute + trans.put("ê", "e"); //e mit Circumflex + trans.put("ë", "e"); //e Umlaut + trans.put("ì", "i"); //i mit Accent grave + trans.put("í", "i"); //i mit Accent acute + trans.put("î", "i"); //i mit Circumflex + trans.put("ï", "i"); //i Umlaut + trans.put("ð", "d"); //eth (isländisch) + trans.put("ñ", "n"); //n mit Tilde + trans.put("ò", "o"); //o mit Accent grave + trans.put("ó", "o"); //o mit Accent acute + trans.put("ô", "o"); //o mit Circumflex + trans.put("õ", "o"); //o mit Tilde + trans.put("ö", "oe"); //o Umlaut + trans.put("÷", "%"); //Divisions-Zeichen + trans.put("ø", "o"); //o mit Schrägstrich + trans.put("ù", "u"); //u mit Accent grave + trans.put("ú", "u"); //u mit Accent acute + trans.put("û", "u"); //u mit Circumflex + trans.put("ü", "ue"); //u Umlaut + trans.put("ý", "y"); //y mit Accent acute + trans.put("þ", "p"); //thorn (isländisch) + trans.put("ÿ", "y"); //y Umlaut + trans.put("Α", " Alpha "); //Alpha groß + trans.put("α", " alpha "); //alpha klein + trans.put("Β", " Beta "); //Beta groß + trans.put("β", " beta "); //beta klein + trans.put("Γ", " Gamma "); //Gamma groß + trans.put("γ", " gamma "); //gamma klein + trans.put("Δ", " Delta "); //Delta groß + trans.put("δ", " delta "); //delta klein + trans.put("Ε", " Epsilon "); //Epsilon groß + trans.put("ε", " epsilon "); //epsilon klein + trans.put("Ζ", " Zeta "); //Zeta groß + trans.put("ζ", " zeta "); //zeta klein + trans.put("Η", " Eta "); //Eta groß + trans.put("η", " eta "); //eta klein + trans.put("Θ", " Theta "); //Theta groß + trans.put("θ", " theta "); //theta klein + trans.put("Ι", " Iota "); //Iota groß + trans.put("ι", " iota "); //iota klein + trans.put("Κ", " Kappa "); //Kappa groß + trans.put("κ", " kappa "); //kappa klein + trans.put("Λ", " Lambda "); //Lambda groß + trans.put("λ", " lambda "); //lambda klein + trans.put("Μ", " Mu "); //Mu groß + trans.put("μ", " mu "); //mu klein + trans.put("Ν", " Nu "); //Nu groß + trans.put("ν", " nu "); //nu klein + trans.put("Ξ", " Xi "); //Xi groß + trans.put("ξ", " xi "); //xi klein + trans.put("Ο", " Omicron "); //Omicron groß + trans.put("ο", " omicron "); //omicron klein + trans.put("Π", " Pi "); //Pi groß + trans.put("π", " pi "); //pi klein + trans.put("Ρ", " Rho "); //Rho groß + trans.put("ρ", " rho "); //rho klein + trans.put("Σ", " Sigma "); //Sigma groß + trans.put("ς", " sigma "); //sigmaf klein + trans.put("σ", " sigma "); //sigma klein + trans.put("Τ", " Tau "); //Tau groß + trans.put("τ", " tau "); //tau klein + trans.put("Υ", " Ypsilon "); //Upsilon groß + trans.put("υ", " ypsilon "); //upsilon klein + trans.put("Φ", " Phi "); //Phi groß + trans.put("φ", " phi "); //phi klein + trans.put("Χ", " Chi "); //Chi groß + trans.put("χ", " chi "); //chi klein + trans.put("Ψ", " Psi "); //Psi groß + trans.put("ψ", " psi "); //psi klein + trans.put("Ω", " Omega "); //Omega groß + trans.put("ω", " omega "); //omega klein + trans.put("ϑ", " theta "); //theta Symbol + trans.put("ϒ", " ypsilon "); //upsilon mit Haken + trans.put("ϖ", " pi "); //pi Symbol + trans.put("∀", " for all "); //für alle + trans.put("∂", " part of "); //teilweise + trans.put("∃", " exists "); //existiert + trans.put("∅", " null "); //leer + trans.put("∇", " nabla "); //nabla + trans.put("∈", " element of "); //Element von + trans.put("∉", " not element of "); //kein Element von + trans.put("∋", " contains "); //enthält als Element + trans.put("∏", " product "); //Produkt + trans.put("∑", " sum "); //Summe + trans.put("−", " minus "); //minus + trans.put("∗", " times "); //Asterisk + trans.put("√", " sqare root "); //Quadratwurzel + trans.put("∝", " proportional to "); //proportional zu + trans.put("∞", " unlimited "); //unendlich + trans.put("∠", " angle "); //Winkel + trans.put("∧", " and "); //und + trans.put("∨", " or "); //oder + trans.put("∩", " "); //Schnittpunkt + trans.put("∪", " unity "); //Einheit + trans.put("∫", " integral "); //Integral + trans.put("∴", " cause "); //deshalb + trans.put("∼", " similar to "); //ähnlich wie + trans.put("≅", " equal "); //annähernd gleich + trans.put("≈", " equal "); //beinahe gleich + trans.put("≠", " not equal "); //ungleich + trans.put("≡", " identical "); //identisch mit + trans.put("≤", " smaller or equal than "); //kleiner gleich + trans.put("≥", " greater or equal than "); //größer gleich + trans.put("⊂", " subset of "); //Untermenge von + trans.put("⊃", " superset of "); //Obermenge von + trans.put("⊄", " not subset of "); //keine Untermenge von + trans.put("⊆", ""); //Untermenge von oder gleich mit + trans.put("⊇", ""); //Obermenge von oder gleich mit + trans.put("⊕", ""); //Direktsumme + trans.put("⊗", ""); //Vektorprodukt + trans.put("⊥", ""); //senkrecht zu + trans.put("⋅", ""); //Punkt-Operator + trans.put("◊", ""); //Raute + trans.put("⌈", ""); //links oben + trans.put("⌉", ""); //rechts oben + trans.put("⌊", ""); //links unten + trans.put("⌋", ""); //rechts unten + trans.put("⟨", ""); //spitze Klammer links + trans.put("⟩", ""); //spitze Klammer rechts + trans.put("←", ""); //Pfeil links + trans.put("↑", ""); //Pfeil oben + trans.put("→", ""); //Pfeil rechts + trans.put("↓", ""); //Pfeil unten + trans.put("↔", ""); //Pfeil links/rechts + trans.put("↵", ""); //Pfeil unten-Knick-links + trans.put("⇐", ""); //Doppelpfeil links + trans.put("⇑", ""); //Doppelpfeil oben + trans.put("⇒", ""); //Doppelpfeil rechts + trans.put("⇓", ""); //Doppelpfeil unten + trans.put("⇔", ""); //Doppelpfeil links/rechts + trans.put("•", ""); //Bullet-Zeichen + trans.put("…", ""); //Horizontale Ellipse + trans.put("′", ""); //Minutenzeichen + trans.put("‾", ""); //Überstrich + trans.put("⁄", ""); //Bruchstrich + trans.put("℘", ""); //Weierstrass p + trans.put("ℑ", ""); //Zeichen für "imaginär" + trans.put("ℜ", ""); //Zeichen für "real" + trans.put("™", ""); //Trademark-Zeichen + trans.put("€", ""); //Euro-Zeichen + trans.put("ℵ", ""); //Alef-Symbol + trans.put("♠", ""); //Pik-Zeichen + trans.put("♣", ""); //Kreuz-Zeichen + trans.put("♥", ""); //Herz-Zeichen + trans.put("♦", ""); //Karo-Zeichen + trans.put(" ", ""); //Leerzeichen Breite n + trans.put(" ", ""); //Leerzeichen Breite m + trans.put(" ", ""); //Schmales Leerzeichen + trans.put("‌", ""); //null breiter Nichtverbinder + trans.put("‍", ""); //null breiter Verbinder + trans.put("‎", ""); //links-nach-rechts-Zeichen + trans.put("‏", ""); //rechts-nach-links-Zeichen + trans.put("–", ""); //Gedankenstrich Breite n + trans.put("—", ""); //Gedankenstrich Breite m + trans.put("‘", ""); //einfaches Anführungszeichen links + trans.put("’", ""); //einfaches Anführungszeichen rechts + trans.put("‚", ""); //einfaches low-9-Zeichen + trans.put("“", ""); //doppeltes Anführungszeichen links + trans.put("”", ""); //doppeltes Anführungszeichen rechts + trans.put("„", ""); //doppeltes low-9-Zeichen rechts + trans.put("†", ""); //Kreuz + trans.put("‡", ""); //Doppelkreuz + trans.put("‰", ""); //zu tausend + trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links + trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts + } + + public htmlFilterAbstractScraper(HashSet tags0, HashSet tags1) { this.tags0 = tags0; this.tags1 = tags1; @@ -55,9 +305,6 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { //the 'missing' method that shall be implemented: public abstract void scrapeText(byte[] text); - /* could be easily implemented as: - { } - */ // the other methods must take into account to construct the return value correctly public void scrapeTag0(String tagname, Properties tagopts) { @@ -66,25 +313,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { } - protected static serverByteBuffer stripAllTags(serverByteBuffer bb) { - int p0, p1; - while ((p0 = bb.indexOf(lb)) >= 0) { - p1 = bb.indexOf(rb, p0); - if (p1 >= 0) { - bb = new serverByteBuffer(bb.getBytes(0, p0)).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim()); - } else { - bb = new serverByteBuffer(bb.getBytes(0, p0)).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim()); - } - } - return bb.trim(); - } - // string conversions - private static serverByteBuffer code_iso8859(byte c) { - String s = code_iso8859s(c); - if (s == null) return null; else return new serverByteBuffer(s.getBytes()); - } - private static String code_iso8859s(byte c) { switch ((int) c & 0xff) { @@ -127,279 +356,51 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { } public static serverByteBuffer convertUmlaute(serverByteBuffer bb) { - serverByteBuffer t = new serverByteBuffer(); - serverByteBuffer z; + serverByteBuffer t = new serverByteBuffer(bb.length() + 20); + byte b; + String z; for (int i = 0; i < bb.length(); i++) { - z = code_iso8859(bb.byteAt(i)); - t.append((z == null) ? (new serverByteBuffer().append(bb.byteAt(i))) : z); + b = bb.byteAt(i); + z = code_iso8859s(b); + if (z == null) t.append(b); else t.append(z); } return t; } - private static String transscripts(String code) { - if (code.equals(""")) return "\""; //Anführungszeichen oben - if (code.equals("&")) return "&"; //Ampersand-Zeichen, kaufmännisches Und - if (code.equals("<")) return "<"; //öffnende spitze Klammer - if (code.equals(">")) return ">"; //schließende spitze Klammer - if (code.equals(" ")) return " "; //Erzwungenes Leerzeichen - if (code.equals("¡")) return "!"; //umgekehrtes Ausrufezeichen - if (code.equals("¢")) return " cent "; //Cent-Zeichen - if (code.equals("£")) return " pound "; //Pfund-Zeichen - if (code.equals("¤")) return " currency "; //Währungs-Zeichen - if (code.equals("¥")) return " yen "; //Yen-Zeichen - if (code.equals("¦")) return " "; //durchbrochener Strich - if (code.equals("§")) return " paragraph "; //Paragraph-Zeichen - if (code.equals("¨")) return " "; //Pünktchen oben - if (code.equals("©")) return " copyright "; //Copyright-Zeichen - if (code.equals("ª")) return " "; //Ordinal-Zeichen weiblich - if (code.equals("«")) return " "; //angewinkelte Anführungszeichen links - if (code.equals("¬")) return " not "; //Verneinungs-Zeichen - if (code.equals("­")) return "-"; //kurzer Trennstrich - if (code.equals("®")) return " trademark "; //Registriermarke-Zeichen - if (code.equals("¯")) return " "; //Überstrich - if (code.equals("°")) return " degree "; //Grad-Zeichen - if (code.equals("±")) return " +/- "; //Plusminus-Zeichen - if (code.equals("²")) return " square "; //Hoch-2-Zeichen - if (code.equals("³")) return " 3 "; //Hoch-3-Zeichen - if (code.equals("´")) return " "; //Acute-Zeichen - if (code.equals("µ")) return " micro "; //Mikro-Zeichen - if (code.equals("¶")) return " paragraph "; //Absatz-Zeichen - if (code.equals("·")) return " "; //Mittelpunkt - if (code.equals("¸")) return " "; //Häkchen unten - if (code.equals("¹")) return " "; //Hoch-1-Zeichen - if (code.equals("º")) return " degree "; //Ordinal-Zeichen männlich - if (code.equals("»")) return " "; //angewinkelte Anführungszeichen rechts - if (code.equals("¼")) return " quarter "; //ein Viertel - if (code.equals("½")) return " half "; //ein Halb - if (code.equals("¾")) return " 3/4 "; //drei Viertel - if (code.equals("¿")) return "?"; //umgekehrtes Fragezeichen - if (code.equals("À")) return "A"; //A mit Accent grave - if (code.equals("Á")) return "A"; //A mit Accent acute - if (code.equals("Â")) return "A"; //A mit Circumflex - if (code.equals("Ã")) return "A"; //A mit Tilde - if (code.equals("Ä")) return "Ae"; //A Umlaut - if (code.equals("Å")) return "A"; //A mit Ring - if (code.equals("Æ")) return "A"; //A mit legiertem E - if (code.equals("Ç")) return "C"; //C mit Häkchen - if (code.equals("È")) return "E"; //E mit Accent grave - if (code.equals("É")) return "E"; //E mit Accent acute - if (code.equals("Ê")) return "E"; //E mit Circumflex - if (code.equals("Ë")) return "E"; //E Umlaut - if (code.equals("Ì")) return "I"; //I mit Accent grave - if (code.equals("Í")) return "I"; //I mit Accent acute - if (code.equals("Î")) return "I"; //I mit Circumflex - if (code.equals("Ï")) return "I"; //I Umlaut - if (code.equals("Ð")) return "D"; //Eth (isländisch) - if (code.equals("Ñ")) return "N"; //N mit Tilde - if (code.equals("Ò")) return "O"; //O mit Accent grave - if (code.equals("Ó")) return "O"; //O mit Accent acute - if (code.equals("Ô")) return "O"; //O mit Circumflex - if (code.equals("Õ")) return "O"; //O mit Tilde - if (code.equals("Ö")) return "Oe"; //O Umlaut - if (code.equals("×")) return " times "; //Mal-Zeichen - if (code.equals("Ø")) return "O"; //O mit Schrägstrich - if (code.equals("Ù")) return "U"; //U mit Accent grave - if (code.equals("Ú")) return "U"; //U mit Accent acute - if (code.equals("Û")) return "U"; //U mit Circumflex - if (code.equals("Ü")) return "Ue"; //U Umlaut - if (code.equals("Ý")) return "Y"; //Y mit Accent acute - if (code.equals("Þ")) return "P"; //THORN (isländisch) - if (code.equals("ß")) return "ss"; //scharfes S - if (code.equals("à")) return "a"; //a mit Accent grave - if (code.equals("á")) return "a"; //a mit Accent acute - if (code.equals("â")) return "a"; //a mit Circumflex - if (code.equals("ã")) return "a"; //a mit Tilde - if (code.equals("ä")) return "ae"; //a Umlaut - if (code.equals("å")) return "a"; //a mit Ring - if (code.equals("æ")) return "a"; //a mit legiertem e - if (code.equals("ç")) return "c"; //c mit Häkchen - if (code.equals("è")) return "e"; //e mit Accent grave - if (code.equals("é")) return "e"; //e mit Accent acute - if (code.equals("ê")) return "e"; //e mit Circumflex - if (code.equals("ë")) return "e"; //e Umlaut - if (code.equals("ì")) return "i"; //i mit Accent grave - if (code.equals("í")) return "i"; //i mit Accent acute - if (code.equals("î")) return "i"; //i mit Circumflex - if (code.equals("ï")) return "i"; //i Umlaut - if (code.equals("ð")) return "d"; //eth (isländisch) - if (code.equals("ñ")) return "n"; //n mit Tilde - if (code.equals("ò")) return "o"; //o mit Accent grave - if (code.equals("ó")) return "o"; //o mit Accent acute - if (code.equals("ô")) return "o"; //o mit Circumflex - if (code.equals("õ")) return "o"; //o mit Tilde - if (code.equals("ö")) return "oe"; //o Umlaut - if (code.equals("÷")) return "%"; //Divisions-Zeichen - if (code.equals("ø")) return "o"; //o mit Schrägstrich - if (code.equals("ù")) return "u"; //u mit Accent grave - if (code.equals("ú")) return "u"; //u mit Accent acute - if (code.equals("û")) return "u"; //u mit Circumflex - if (code.equals("ü")) return "ue"; //u Umlaut - if (code.equals("ý")) return "y"; //y mit Accent acute - if (code.equals("þ")) return "p"; //thorn (isländisch) - if (code.equals("ÿ")) return "y"; //y Umlaut - if (code.equals("Α")) return " Alpha "; //Alpha groß - if (code.equals("α")) return " alpha "; //alpha klein - if (code.equals("Β")) return " Beta "; //Beta groß - if (code.equals("β")) return " beta "; //beta klein - if (code.equals("Γ")) return " Gamma "; //Gamma groß - if (code.equals("γ")) return " gamma "; //gamma klein - if (code.equals("Δ")) return " Delta "; //Delta groß - if (code.equals("δ")) return " delta "; //delta klein - if (code.equals("Ε")) return " Epsilon "; //Epsilon groß - if (code.equals("ε")) return " epsilon "; //epsilon klein - if (code.equals("Ζ")) return " Zeta "; //Zeta groß - if (code.equals("ζ")) return " zeta "; //zeta klein - if (code.equals("Η")) return " Eta "; //Eta groß - if (code.equals("η")) return " eta "; //eta klein - if (code.equals("Θ")) return " Theta "; //Theta groß - if (code.equals("θ")) return " theta "; //theta klein - if (code.equals("Ι")) return " Iota "; //Iota groß - if (code.equals("ι")) return " iota "; //iota klein - if (code.equals("Κ")) return " Kappa "; //Kappa groß - if (code.equals("κ")) return " kappa "; //kappa klein - if (code.equals("Λ")) return " Lambda "; //Lambda groß - if (code.equals("λ")) return " lambda "; //lambda klein - if (code.equals("Μ")) return " Mu "; //Mu groß - if (code.equals("μ")) return " mu "; //mu klein - if (code.equals("Ν")) return " Nu "; //Nu groß - if (code.equals("ν")) return " nu "; //nu klein - if (code.equals("Ξ")) return " Xi "; //Xi groß - if (code.equals("ξ")) return " xi "; //xi klein - if (code.equals("Ο")) return " Omicron "; //Omicron groß - if (code.equals("ο")) return " omicron "; //omicron klein - if (code.equals("Π")) return " Pi "; //Pi groß - if (code.equals("π")) return " pi "; //pi klein - if (code.equals("Ρ")) return " Rho "; //Rho groß - if (code.equals("ρ")) return " rho "; //rho klein - if (code.equals("Σ")) return " Sigma "; //Sigma groß - if (code.equals("ς")) return " sigma "; //sigmaf klein - if (code.equals("σ")) return " sigma "; //sigma klein - if (code.equals("Τ")) return " Tau "; //Tau groß - if (code.equals("τ")) return " tau "; //tau klein - if (code.equals("Υ")) return " Ypsilon "; //Upsilon groß - if (code.equals("υ")) return " ypsilon "; //upsilon klein - if (code.equals("Φ")) return " Phi "; //Phi groß - if (code.equals("φ")) return " phi "; //phi klein - if (code.equals("Χ")) return " Chi "; //Chi groß - if (code.equals("χ")) return " chi "; //chi klein - if (code.equals("Ψ")) return " Psi "; //Psi groß - if (code.equals("ψ")) return " psi "; //psi klein - if (code.equals("Ω")) return " Omega "; //Omega groß - if (code.equals("ω")) return " omega "; //omega klein - if (code.equals("ϑ")) return " theta "; //theta Symbol - if (code.equals("ϒ")) return " ypsilon "; //upsilon mit Haken - if (code.equals("ϖ")) return " pi "; //pi Symbol - if (code.equals("∀")) return " for all "; //für alle - if (code.equals("∂")) return " part of "; //teilweise - if (code.equals("∃")) return " exists "; //existiert - if (code.equals("∅")) return " null "; //leer - if (code.equals("∇")) return " nabla "; //nabla - if (code.equals("∈")) return " element of "; //Element von - if (code.equals("∉")) return " not element of "; //kein Element von - if (code.equals("∋")) return " contains "; //enthält als Element - if (code.equals("∏")) return " product "; //Produkt - if (code.equals("∑")) return " sum "; //Summe - if (code.equals("−")) return " minus "; //minus - if (code.equals("∗")) return " times "; //Asterisk - if (code.equals("√")) return " sqare root "; //Quadratwurzel - if (code.equals("∝")) return " proportional to "; //proportional zu - if (code.equals("∞")) return " unlimited "; //unendlich - if (code.equals("∠")) return " angle "; //Winkel - if (code.equals("∧")) return " and "; //und - if (code.equals("∨")) return " or "; //oder - if (code.equals("∩")) return " "; //Schnittpunkt - if (code.equals("∪")) return " unity "; //Einheit - if (code.equals("∫")) return " integral "; //Integral - if (code.equals("∴")) return " cause "; //deshalb - if (code.equals("∼")) return " similar to "; //ähnlich wie - if (code.equals("≅")) return " equal "; //annähernd gleich - if (code.equals("≈")) return " equal "; //beinahe gleich - if (code.equals("≠")) return " not equal "; //ungleich - if (code.equals("≡")) return " identical "; //identisch mit - if (code.equals("≤")) return " smaller or equal than "; //kleiner gleich - if (code.equals("≥")) return " greater or equal than "; //größer gleich - if (code.equals("⊂")) return " subset of "; //Untermenge von - if (code.equals("⊃")) return " superset of "; //Obermenge von - if (code.equals("⊄")) return " not subset of "; //keine Untermenge von - if (code.equals("⊆")) return ""; //Untermenge von oder gleich mit - if (code.equals("⊇")) return ""; //Obermenge von oder gleich mit - if (code.equals("⊕")) return ""; //Direktsumme - if (code.equals("⊗")) return ""; //Vektorprodukt - if (code.equals("⊥")) return ""; //senkrecht zu - if (code.equals("⋅")) return ""; //Punkt-Operator - if (code.equals("◊")) return ""; //Raute - if (code.equals("⌈")) return ""; //links oben - if (code.equals("⌉")) return ""; //rechts oben - if (code.equals("⌊")) return ""; //links unten - if (code.equals("⌋")) return ""; //rechts unten - if (code.equals("⟨")) return ""; //spitze Klammer links - if (code.equals("⟩")) return ""; //spitze Klammer rechts - if (code.equals("←")) return ""; //Pfeil links - if (code.equals("↑")) return ""; //Pfeil oben - if (code.equals("→")) return ""; //Pfeil rechts - if (code.equals("↓")) return ""; //Pfeil unten - if (code.equals("↔")) return ""; //Pfeil links/rechts - if (code.equals("↵")) return ""; //Pfeil unten-Knick-links - if (code.equals("⇐")) return ""; //Doppelpfeil links - if (code.equals("⇑")) return ""; //Doppelpfeil oben - if (code.equals("⇒")) return ""; //Doppelpfeil rechts - if (code.equals("⇓")) return ""; //Doppelpfeil unten - if (code.equals("⇔")) return ""; //Doppelpfeil links/rechts - if (code.equals("•")) return ""; //Bullet-Zeichen - if (code.equals("…")) return ""; //Horizontale Ellipse - if (code.equals("′")) return ""; //Minutenzeichen - if (code.equals("‾")) return ""; //Überstrich - if (code.equals("⁄")) return ""; //Bruchstrich - if (code.equals("℘")) return ""; //Weierstrass p - if (code.equals("ℑ")) return ""; //Zeichen für "imaginär" - if (code.equals("ℜ")) return ""; //Zeichen für "real" - if (code.equals("™")) return ""; //Trademark-Zeichen - if (code.equals("€")) return ""; //Euro-Zeichen - if (code.equals("ℵ")) return ""; //Alef-Symbol - if (code.equals("♠")) return ""; //Pik-Zeichen - if (code.equals("♣")) return ""; //Kreuz-Zeichen - if (code.equals("♥")) return ""; //Herz-Zeichen - if (code.equals("♦")) return ""; //Karo-Zeichen - if (code.equals(" ")) return ""; //Leerzeichen Breite n - if (code.equals(" ")) return ""; //Leerzeichen Breite m - if (code.equals(" ")) return ""; //Schmales Leerzeichen - if (code.equals("‌")) return ""; //null breiter Nichtverbinder - if (code.equals("‍")) return ""; //null breiter Verbinder - if (code.equals("‎")) return ""; //links-nach-rechts-Zeichen - if (code.equals("‏")) return ""; //rechts-nach-links-Zeichen - if (code.equals("–")) return ""; //Gedankenstrich Breite n - if (code.equals("—")) return ""; //Gedankenstrich Breite m - if (code.equals("‘")) return ""; //einfaches Anführungszeichen links - if (code.equals("’")) return ""; //einfaches Anführungszeichen rechts - if (code.equals("‚")) return ""; //einfaches low-9-Zeichen - if (code.equals("“")) return ""; //doppeltes Anführungszeichen links - if (code.equals("”")) return ""; //doppeltes Anführungszeichen rechts - if (code.equals("„")) return ""; //doppeltes low-9-Zeichen rechts - if (code.equals("†")) return ""; //Kreuz - if (code.equals("‡")) return ""; //Doppelkreuz - if (code.equals("‰")) return ""; //zu tausend - if (code.equals("‹")) return ""; //angewinkeltes einzelnes Anf.zeichen links - if (code.equals("›")) return ""; //angewinkeltes einzelnes Anf.zeichen rechts - - return ""; - } - private static byte[] transscript(byte[] code) { - return transscripts(new String(code)).getBytes(); + String t = (String) trans.get(new String(code)); + if (t == null) return new byte[0]; else return t.getBytes(); } protected static serverByteBuffer transscriptAll(serverByteBuffer bb) { - int p0, p1; - while ((p0 = bb.indexOf((byte) '&')) >= 0) { + int p0 = 0, p1; + byte[] t; + while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) { p1 = bb.indexOf((byte) ';', p0); - if (p1 >= 0) - bb = new serverByteBuffer(bb.getBytes(0, p0)).append(transscript(bb.getBytes(p0, p1 + 1))).append(bb.getBytes(p1 + 1)); - else - bb = new serverByteBuffer(bb.getBytes(0, p0)).append(bb.getBytes(p0 + 1)); + if (p1 >= 0) { + t = transscript(bb.getBytes(p0, p1 + 1)); + bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1)); + } else { + bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1)); + } } + t = null; return bb; } + protected static serverByteBuffer stripAllTags(serverByteBuffer bb) { + int p0 = 0, p1; + while ((p0 = bb.indexOf(lb, p0)) >= 0) { + p1 = bb.indexOf(rb, p0); + if (p1 >= 0) { + bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim()); + } else { + bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim()); + } + } + return bb.trim(); + } + public static serverByteBuffer stripAll(serverByteBuffer bb) { //return stripAllTags(s); return convertUmlaute(transscriptAll(stripAllTags(bb))); diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index c8a1ee39b..a4751847d 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -83,14 +83,13 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen this.images = new HashMap(); this.title = ""; this.headline = ""; - this.text = new serverByteBuffer(); + this.text = new serverByteBuffer(1024); } - public void scrapeText(byte[] newtext) { //System.out.println("SCRAPE: " + new String(newtext)); - if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32); - text.append(new serverByteBuffer(super.stripAll(new serverByteBuffer(newtext))).trim()).append((byte) ' '); + if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append(32); + text.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32); } public static String urlNormalform(URL url) { @@ -122,12 +121,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { //System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text)); - //if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString()); - //if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); - //if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); - if ((tagname.equals("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString()); - if ((tagname.equals("h1")) && (text.length < 512)) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); - if ((tagname.equals("title")) && (text.length < 512)) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); + if ((tagname.equals("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString()); + if ((tagname.equals("h1")) && (text.length < 512)) headline = super.stripAll(new serverByteBuffer(text)).toString(); + if ((tagname.equals("title")) && (text.length < 512)) title = super.stripAll(new serverByteBuffer(text)).toString(); } @@ -138,7 +134,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (title.length() > 0) hl = title.trim(); else if (headline.length() > 0) hl = headline.trim(); else if (text.length() > 80) hl = new String(text.getBytes(), 0, 80).trim(); - else hl = text.toString().trim(); + else hl = text.trim().toString(); // clean the line: may contain too many funny symbols for (int i = 0; i < hl.length(); i++) diff --git a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java index df3037fb6..5408f9612 100644 --- a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java +++ b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java @@ -91,7 +91,7 @@ public final class htmlFilterOutputStream extends OutputStream { this.out = out; this.scraper = scraper; this.transformer = transformer; - this.buffer = new serverByteBuffer(); + this.buffer = new serverByteBuffer(1024); this.filterTag = null; this.filterOpts = null; this.filterCont = null; @@ -105,7 +105,7 @@ public final class htmlFilterOutputStream extends OutputStream { public static byte[] genTag0raw(String tagname, boolean opening, byte[] tagopts) { - serverByteBuffer bb = new serverByteBuffer(); + serverByteBuffer bb = new serverByteBuffer(tagname.length() + tagopts.length + 3); bb.append((byte) '<'); if (!(opening)) bb.append((byte) '/'); bb.append(tagname.getBytes()); @@ -119,7 +119,7 @@ public final class htmlFilterOutputStream extends OutputStream { } public static byte[] genTag1raw(String tagname, byte[] tagopts, byte[] text) { - serverByteBuffer bb = new serverByteBuffer(); + serverByteBuffer bb = new serverByteBuffer(2 * tagname.length() + tagopts.length + text.length + 5); bb.append((byte) '<').append(tagname.getBytes()); if (tagopts.length > 0) { //if (tagopts[0] == (byte) 32) @@ -132,22 +132,23 @@ public final class htmlFilterOutputStream extends OutputStream { return bb.getBytes(); } - public static byte[] genTag0(String tagname, Properties tagopts, byte quotechar) { - serverByteBuffer bb = new serverByteBuffer().append((byte) '<').append(tagname.getBytes()); - if (tagopts.size() != 0) bb = bb.append((byte) 32).append(genOpts(tagopts, quotechar)); + byte[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar); + serverByteBuffer bb = new serverByteBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2).append((byte) '<').append(tagname.getBytes()); + if (tagoptsx != null) bb = bb.append((byte) 32).append(tagoptsx); bb = bb.append((byte) '>'); return bb.getBytes(); } public static byte[] genTag1(String tagname, Properties tagopts, byte[] text, byte quotechar) { - return new serverByteBuffer(genTag0(tagname, tagopts, quotechar)).append(text).append(("").getBytes()).getBytes(); + byte[] gt0 = genTag0(tagname, tagopts, quotechar); + return new serverByteBuffer(gt0, gt0.length + text.length + tagname.length() + 3).append(text).append(("").getBytes()).getBytes(); } // a helper method for pretty-printing of properties for html tags public static byte[] genOpts(Properties prop, byte quotechar) { Enumeration e = prop.propertyNames(); - serverByteBuffer bb = new serverByteBuffer(); + serverByteBuffer bb = new serverByteBuffer(prop.size() * 40); String key; while (e.hasMoreElements()) { key = (String) e.nextElement(); diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java index aae438aa1..76a2425c7 100644 --- a/source/de/anomic/http/httpc.java +++ b/source/de/anomic/http/httpc.java @@ -164,7 +164,7 @@ public final class httpc { * A reusable readline buffer * @see serverByteBuffer */ - final serverByteBuffer readLineBuffer = new serverByteBuffer(); + final serverByteBuffer readLineBuffer = new serverByteBuffer(100); public String toString() { return (this.savedRemoteHost == null) ? "Disconnected" : "Connected to " + this.savedRemoteHost + diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java index 7ffdd5791..95c9db869 100644 --- a/source/de/anomic/http/httpd.java +++ b/source/de/anomic/http/httpd.java @@ -1098,7 +1098,7 @@ public final class httpd implements serverHandler { // building the stacktrace if (stackTrace != null) { - serverByteBuffer errorMsg = new serverByteBuffer(); + serverByteBuffer errorMsg = new serverByteBuffer(100); errorMsg.append("Exception occurred:\r\n\r\n") .append(stackTrace.toString()) .append("\r\n") diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index d23fd613a..551e8eb60 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -604,19 +604,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // do a local crawl plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE); - if (urlEntry.url() == null) return false; + String stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; + if (urlEntry.url() == null) { + log.logError(stats + ": urlEntry.url() == null"); + return true; + } String profileHandle = urlEntry.profileHandle(); //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); if (profile == null) { - log.logError("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); - return false; + log.logError(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + return true; } log.logDebug("LOCALCRAWL: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false"))); - return processLocalCrawling(urlEntry, profile); + processLocalCrawling(urlEntry, profile, stats); + return true; } public int limitCrawlTriggerJobSize() { @@ -629,7 +634,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return false; } // if the server is busy, we do crawling more slowly - if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {} + //if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {} // if crawling was paused we have to wait until we wer notified to continue synchronized(this.crawlingPausedSync) { @@ -643,13 +648,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // start a global crawl, if possible plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT); - if (urlEntry.url() == null) return true; + String stats = "REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; + if (urlEntry.url() == null) { + log.logError(stats + ": urlEntry.url() == null"); + return true; + } String profileHandle = urlEntry.profileHandle(); //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); if (profile == null) { - log.logError("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); - return false; + log.logError(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + return true; } log.logDebug("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() + @@ -679,7 +688,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return false; } - processLocalCrawling(urlEntry, profile); + processLocalCrawling(urlEntry, profile, stats); return true; } @@ -710,7 +719,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser */ // if the server is busy, we do this more slowly - if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {} + //if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {} // if crawling was paused we have to wait until we wer notified to continue synchronized(this.crawlingPausedSync) { @@ -724,19 +733,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view) plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE); - if (urlEntry.url() == null) return false; + String stats = "REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; + if (urlEntry.url() == null) { + log.logError(stats + ": urlEntry.url() == null"); + return false; + } String profileHandle = urlEntry.profileHandle(); //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); + if (profile == null) { - log.logError("REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + log.logError(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); return false; } log.logDebug("plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false"))); - return processLocalCrawling(urlEntry, profile); + processLocalCrawling(urlEntry, profile, stats); + return true; } private void processResourceStack(plasmaHTCache.Entry entry) { @@ -1000,21 +1015,22 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (u == null) return plasmaURL.dummyHash; else return u.toString(); } - private boolean processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile) { + private void processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile, String stats) { // work off one Crawl stack entry if ((urlEntry == null) && (urlEntry.url() == null)) { - log.logInfo("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null"); - return false; + log.logInfo(stats + ": urlEntry=null"); + return; } cacheLoader.loadParallel(urlEntry.url(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile); - log.logInfo("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: enqueued for load " + urlEntry.url()); - return true; + log.logInfo(stats + ": enqueued for load " + urlEntry.url()); + return; } private boolean processRemoteCrawlTrigger(plasmaCrawlNURL.entry urlEntry) { + // return true iff another peer has/will index(ed) the url if (urlEntry == null) { log.logInfo("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null"); - return false; + return true; // superfluous request; true correct in this context } // are we qualified? @@ -1027,19 +1043,22 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // check url if (urlEntry.url() == null) { log.logDebug("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name()); - return false; + return true; } String nexturlString = urlEntry.url().toString(); String urlhash = plasmaURL.urlHash(urlEntry.url()); // check remote crawl yacySeed remoteSeed = yacyCore.dhtAgent.getCrawlSeed(urlhash); + if (remoteSeed == null) { log.logDebug("plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available"); - return false; + return false; } + // do the request HashMap page = yacyClient.crawlOrder(remoteSeed, nexturlString, hash2urlstring(urlEntry.referrerHash()), 0); + // check success /* @@ -1060,7 +1079,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser */ if ((page == null) || (page.get("delay") == null)) { log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + nexturlString + ")"); - yacyCore.peerActions.peerDeparture(remoteSeed); + if (remoteSeed != null) yacyCore.peerActions.peerDeparture(remoteSeed); return false; } else try { log.logDebug("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + nexturlString + ", response=" + page.toString()); // DEBUG @@ -1093,7 +1112,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser e.printStackTrace(); return false; } - } diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java index 450fd34ab..70cd5693e 100644 --- a/source/de/anomic/server/serverByteBuffer.java +++ b/source/de/anomic/server/serverByteBuffer.java @@ -57,12 +57,13 @@ public final class serverByteBuffer extends OutputStream { private int offset; private int length; + public serverByteBuffer() { buffer = new byte[80]; length = 0; offset = 0; } - + public serverByteBuffer(int initLength) { this.buffer = new byte[initLength]; this.length = 0; @@ -75,6 +76,13 @@ public final class serverByteBuffer extends OutputStream { offset = 0; } + public serverByteBuffer(byte[] bb, int initLength) { + this.buffer = new byte[initLength]; + System.arraycopy(bb, 0, buffer, 0, bb.length); + length = bb.length; + offset = 0; + } + public serverByteBuffer(byte[] bb, int of, int le) { if (of * 2 > bb.length) { buffer = new byte[le]; @@ -123,7 +131,9 @@ public final class serverByteBuffer extends OutputStream { } private void grow() { - byte[] tmp = new byte[buffer.length * 2 + 1]; + int newsize = buffer.length * 2 + 1; + if (newsize < 256) newsize = 256; + byte[] tmp = new byte[newsize]; System.arraycopy(buffer, offset, tmp, 0, length); buffer = tmp; tmp = null; @@ -154,6 +164,11 @@ public final class serverByteBuffer extends OutputStream { return this; } + public serverByteBuffer append(int i) { + write((byte) (i & 0xFF)); + return this; + } + public serverByteBuffer append(byte[] bb) { write(bb); return this; @@ -237,7 +252,7 @@ public final class serverByteBuffer extends OutputStream { } public String toString() { - return new String(getBytes()); + return new String(getBytes(), offset, length); } public Properties propParser() { diff --git a/source/de/anomic/server/serverDate.java b/source/de/anomic/server/serverDate.java index ab9bc73de..c53684400 100644 --- a/source/de/anomic/server/serverDate.java +++ b/source/de/anomic/server/serverDate.java @@ -227,6 +227,30 @@ public final class serverDate { return testSFormatter.format(gregorian.getTime()); } + public static String intervalToString(long millis) { + try { + long mins = millis / 60000; + + StringBuffer uptime = new StringBuffer(); + + int uptimeDays = (int) (Math.floor(mins/1440)); + int uptimeHours = (int) (Math.floor(mins/60)%24); + int uptimeMins = (int) mins%60; + + uptime.append(uptimeDays) + .append(((uptimeDays == 1)?" day ":" days ")) + .append((uptimeHours < 10)?"0":"") + .append(uptimeHours) + .append(":") + .append((uptimeMins < 10)?"0":"") + .append(uptimeMins); + + return uptime.toString(); + } catch (Exception e) { + return "unknown"; + } + } + public static void main(String[] args) { //System.out.println("kelondroDate is (" + new kelondroDate().toString() + ")"); System.out.println("serverDate : " + new serverDate().toShortString(false));