enhanced html parser speed

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@290 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 5a490aa065
commit 5d06ded005

@ -48,6 +48,7 @@ import java.util.Enumeration;
import de.anomic.http.httpHeader;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverDate;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
@ -101,7 +102,7 @@ public class Network {
accActWords += words;
}
prop.put("table_my-version", seed.get("Version", "-"));
prop.put("table_my-uptime", Status.intervalToString(seed.get("Uptime", "-")));
prop.put("table_my-uptime", serverDate.intervalToString(60000 * Long.parseLong(seed.get("Uptime", ""))));
prop.put("table_my-links", groupDigits(links));
prop.put("table_my-words", groupDigits(words));
prop.put("table_my-acceptcrawl", "" + (seed.getFlagAcceptRemoteCrawl() ? 1 : 0) );
@ -210,7 +211,7 @@ public class Network {
prop.put("table_list_"+conCount+"_version", seed.get("Version", "-"));
prop.put("table_list_"+conCount+"_contact", (seed.getFlagDirectConnect() ? 1 : 0) );
prop.put("table_list_"+conCount+"_lastSeen", lastSeen(seed.get("LastSeen", "-")) );
prop.put("table_list_"+conCount+"_uptime", Status.intervalToString(seed.get("Uptime", "-")));
prop.put("table_list_"+conCount+"_uptime", serverDate.intervalToString(60000 * Long.parseLong(seed.get("Uptime", "0"))));
prop.put("table_list_"+conCount+"_links", groupDigits(links));
prop.put("table_list_"+conCount+"_words", groupDigits(words));
prop.put("table_list_"+conCount+"_acceptcrawl", (seed.getFlagAcceptRemoteCrawl() ? 1 : 0) );

@ -51,6 +51,7 @@ import de.anomic.http.httpdByteCountOutputStream;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverDate;
import de.anomic.yacy.yacyCore;
public class Status {
@ -112,7 +113,7 @@ public class Status {
prop.put("peerStatistics", 0);//unknown
} else {
prop.put("peerStatistics", 1);
prop.put("peerStatistics_uptime", intervalToString(yacyCore.seedDB.mySeed.get("Uptime", "unknown")));
prop.put("peerStatistics_uptime", serverDate.intervalToString(60000 * Long.parseLong(yacyCore.seedDB.mySeed.get("Uptime", "0"))));
prop.put("peerStatistics_links", yacyCore.seedDB.mySeed.get("LCount", "unknown"));
prop.put("peerStatistics_words", yacyCore.seedDB.mySeed.get("ICount", "unknown"));
prop.put("peerStatistics_juniorConnects", yacyCore.peerActions.juniorConnects);
@ -195,31 +196,6 @@ public class Status {
return prop;
}
/**
 * Formats an interval given in minutes as "D day(s) HH:MM".
 *
 * @param minsAsString the interval length in minutes, as a decimal string
 * @return the formatted interval (for example "1 day 00:01"), or "unknown"
 *         when the argument is null or not a parseable long
 */
public static String intervalToString(String minsAsString)
{
    final long mins;
    try {
        mins = Long.parseLong(minsAsString);
    } catch (NumberFormatException e) {
        // Long.parseLong reports null, empty and non-numeric input this way
        return "unknown";
    }
    // Stay in long arithmetic: the former "(int) mins%60" cast the value
    // BEFORE taking the remainder and so truncated large inputs.
    final long uptimeDays = mins / 1440;
    final long uptimeHours = (mins / 60) % 24;
    final long uptimeMins = mins % 60;
    StringBuffer uptime = new StringBuffer();
    uptime.append(uptimeDays)
          .append((uptimeDays == 1) ? " day " : " days ")
          .append((uptimeHours < 10) ? "0" : "")
          .append(uptimeHours)
          .append(":")
          .append((uptimeMins < 10) ? "0" : "")
          .append(uptimeMins);
    return uptime.toString();
}
public static String bytesToString(long byteCount) {
try {
StringBuffer byteString = new StringBuffer();

@ -27,6 +27,7 @@
package de.anomic.htmlFilter;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Properties;
import de.anomic.server.serverByteBuffer;
@ -40,6 +41,255 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
private HashSet tags0;
private HashSet tags1;
// define a translation table for html character codings
private static HashMap trans = new HashMap(300);
static {
trans.put("&quot;", "\""); //Anf&uuml;hrungszeichen oben
trans.put("&amp;", "&"); //Ampersand-Zeichen, kaufm&auml;nnisches Und
trans.put("&lt;", "<"); //&ouml;ffnende spitze Klammer
trans.put("&gt;", ">"); //schlie&szlig;ende spitze Klammer
trans.put("&nbsp;", " "); //Erzwungenes Leerzeichen
trans.put("&iexcl;", "!"); //umgekehrtes Ausrufezeichen
trans.put("&cent;", " cent "); //Cent-Zeichen
trans.put("&pound;", " pound "); //Pfund-Zeichen
trans.put("&curren;", " currency "); //W&auml;hrungs-Zeichen
trans.put("&yen;", " yen "); //Yen-Zeichen
trans.put("&brvbar;", " "); //durchbrochener Strich
trans.put("&sect;", " paragraph "); //Paragraph-Zeichen
trans.put("&uml;", " "); //P&uuml;nktchen oben
trans.put("&copy;", " copyright "); //Copyright-Zeichen
trans.put("&ordf;", " "); //Ordinal-Zeichen weiblich
trans.put("&laquo;", " "); //angewinkelte Anf&uuml;hrungszeichen links
trans.put("&not;", " not "); //Verneinungs-Zeichen
trans.put("&shy;", "-"); //kurzer Trennstrich
trans.put("&reg;", " trademark "); //Registriermarke-Zeichen
trans.put("&macr;", " "); //&Uuml;berstrich
trans.put("&deg;", " degree "); //Grad-Zeichen
trans.put("&plusmn;", " +/- "); //Plusminus-Zeichen
trans.put("&sup2;", " square "); //Hoch-2-Zeichen
trans.put("&sup3;", " 3 "); //Hoch-3-Zeichen
trans.put("&acute;", " "); //Acute-Zeichen
trans.put("&micro;", " micro "); //Mikro-Zeichen
trans.put("&para;", " paragraph "); //Absatz-Zeichen
trans.put("&middot;", " "); //Mittelpunkt
trans.put("&cedil;", " "); //H&auml;kchen unten
trans.put("&sup1;", " "); //Hoch-1-Zeichen
trans.put("&ordm;", " degree "); //Ordinal-Zeichen m&auml;nnlich
trans.put("&raquo;", " "); //angewinkelte Anf&uuml;hrungszeichen rechts
trans.put("&frac14;", " quarter "); //ein Viertel
trans.put("&frac12;", " half "); //ein Halb
trans.put("&frac34;", " 3/4 "); //drei Viertel
trans.put("&iquest;", "?"); //umgekehrtes Fragezeichen
trans.put("&Agrave;", "A"); //A mit Accent grave
trans.put("&Aacute;", "A"); //A mit Accent acute
trans.put("&Acirc;", "A"); //A mit Circumflex
trans.put("&Atilde;", "A"); //A mit Tilde
trans.put("&Auml;", "Ae"); //A Umlaut
trans.put("&Aring;", "A"); //A mit Ring
trans.put("&AElig;", "A"); //A mit legiertem E
trans.put("&Ccedil;", "C"); //C mit H&auml;kchen
trans.put("&Egrave;", "E"); //E mit Accent grave
trans.put("&Eacute;", "E"); //E mit Accent acute
trans.put("&Ecirc;", "E"); //E mit Circumflex
trans.put("&Euml;", "E"); //E Umlaut
trans.put("&Igrave;", "I"); //I mit Accent grave
trans.put("&Iacute;", "I"); //I mit Accent acute
trans.put("&Icirc;", "I"); //I mit Circumflex
trans.put("&Iuml;", "I"); //I Umlaut
trans.put("&ETH;", "D"); //Eth (isl&auml;ndisch)
trans.put("&Ntilde;", "N"); //N mit Tilde
trans.put("&Ograve;", "O"); //O mit Accent grave
trans.put("&Oacute;", "O"); //O mit Accent acute
trans.put("&Ocirc;", "O"); //O mit Circumflex
trans.put("&Otilde;", "O"); //O mit Tilde
trans.put("&Ouml;", "Oe"); //O Umlaut
trans.put("&times;", " times "); //Mal-Zeichen
trans.put("&Oslash;", "O"); //O mit Schr&auml;gstrich
trans.put("&Ugrave;", "U"); //U mit Accent grave
trans.put("&Uacute;", "U"); //U mit Accent acute
trans.put("&Ucirc;", "U"); //U mit Circumflex
trans.put("&Uuml;", "Ue"); //U Umlaut
trans.put("&Yacute;", "Y"); //Y mit Accent acute
trans.put("&THORN;", "P"); //THORN (isl&auml;ndisch)
trans.put("&szlig;", "ss"); //scharfes S
trans.put("&agrave;", "a"); //a mit Accent grave
trans.put("&aacute;", "a"); //a mit Accent acute
trans.put("&acirc;", "a"); //a mit Circumflex
trans.put("&atilde;", "a"); //a mit Tilde
trans.put("&auml;", "ae"); //a Umlaut
trans.put("&aring;", "a"); //a mit Ring
trans.put("&aelig;", "a"); //a mit legiertem e
trans.put("&ccedil;", "c"); //c mit H&auml;kchen
trans.put("&egrave;", "e"); //e mit Accent grave
trans.put("&eacute;", "e"); //e mit Accent acute
trans.put("&ecirc;", "e"); //e mit Circumflex
trans.put("&euml;", "e"); //e Umlaut
trans.put("&igrave;", "i"); //i mit Accent grave
trans.put("&iacute;", "i"); //i mit Accent acute
trans.put("&icirc;", "i"); //i mit Circumflex
trans.put("&iuml;", "i"); //i Umlaut
trans.put("&eth;", "d"); //eth (isl&auml;ndisch)
trans.put("&ntilde;", "n"); //n mit Tilde
trans.put("&ograve;", "o"); //o mit Accent grave
trans.put("&oacute;", "o"); //o mit Accent acute
trans.put("&ocirc;", "o"); //o mit Circumflex
trans.put("&otilde;", "o"); //o mit Tilde
trans.put("&ouml;", "oe"); //o Umlaut
trans.put("&divide;", "%"); //Divisions-Zeichen
trans.put("&oslash;", "o"); //o mit Schr&auml;gstrich
trans.put("&ugrave;", "u"); //u mit Accent grave
trans.put("&uacute;", "u"); //u mit Accent acute
trans.put("&ucirc;", "u"); //u mit Circumflex
trans.put("&uuml;", "ue"); //u Umlaut
trans.put("&yacute;", "y"); //y mit Accent acute
trans.put("&thorn;", "p"); //thorn (isl&auml;ndisch)
trans.put("&yuml;", "y"); //y Umlaut
trans.put("&Alpha;", " Alpha "); //Alpha gro&szlig;
trans.put("&alpha;", " alpha "); //alpha klein
trans.put("&Beta;", " Beta "); //Beta gro&szlig;
trans.put("&beta;", " beta "); //beta klein
trans.put("&Gamma;", " Gamma "); //Gamma gro&szlig;
trans.put("&gamma;", " gamma "); //gamma klein
trans.put("&Delta;", " Delta "); //Delta gro&szlig;
trans.put("&delta;", " delta "); //delta klein
trans.put("&Epsilon;", " Epsilon "); //Epsilon gro&szlig;
trans.put("&epsilon;", " epsilon "); //epsilon klein
trans.put("&Zeta;", " Zeta "); //Zeta gro&szlig;
trans.put("&zeta;", " zeta "); //zeta klein
trans.put("&Eta;", " Eta "); //Eta gro&szlig;
trans.put("&eta;", " eta "); //eta klein
trans.put("&Theta;", " Theta "); //Theta gro&szlig;
trans.put("&theta;", " theta "); //theta klein
trans.put("&Iota;", " Iota "); //Iota gro&szlig;
trans.put("&iota;", " iota "); //iota klein
trans.put("&Kappa;", " Kappa "); //Kappa gro&szlig;
trans.put("&kappa;", " kappa "); //kappa klein
trans.put("&Lambda;", " Lambda "); //Lambda gro&szlig;
trans.put("&lambda;", " lambda "); //lambda klein
trans.put("&Mu;", " Mu "); //Mu gro&szlig;
trans.put("&mu;", " mu "); //mu klein
trans.put("&Nu;", " Nu "); //Nu gro&szlig;
trans.put("&nu;", " nu "); //nu klein
trans.put("&Xi;", " Xi "); //Xi gro&szlig;
trans.put("&xi;", " xi "); //xi klein
trans.put("&Omicron;", " Omicron "); //Omicron gro&szlig;
trans.put("&omicron;", " omicron "); //omicron klein
trans.put("&Pi;", " Pi "); //Pi gro&szlig;
trans.put("&pi;", " pi "); //pi klein
trans.put("&Rho;", " Rho "); //Rho gro&szlig;
trans.put("&rho;", " rho "); //rho klein
trans.put("&Sigma;", " Sigma "); //Sigma gro&szlig;
trans.put("&sigmaf;", " sigma "); //sigmaf klein
trans.put("&sigma;", " sigma "); //sigma klein
trans.put("&Tau;", " Tau "); //Tau gro&szlig;
trans.put("&tau;", " tau "); //tau klein
trans.put("&Upsilon;", " Ypsilon "); //Upsilon gro&szlig;
trans.put("&upsilon;", " ypsilon "); //upsilon klein
trans.put("&Phi;", " Phi "); //Phi gro&szlig;
trans.put("&phi;", " phi "); //phi klein
trans.put("&Chi;", " Chi "); //Chi gro&szlig;
trans.put("&chi;", " chi "); //chi klein
trans.put("&Psi;", " Psi "); //Psi gro&szlig;
trans.put("&psi;", " psi "); //psi klein
trans.put("&Omega;", " Omega "); //Omega gro&szlig;
trans.put("&omega;", " omega "); //omega klein
trans.put("&thetasym;", " theta "); //theta Symbol
trans.put("&upsih;", " ypsilon "); //upsilon mit Haken
trans.put("&piv;", " pi "); //pi Symbol
trans.put("&forall;", " for all "); //f&uuml;r alle
trans.put("&part;", " part of "); //teilweise
trans.put("&exist;", " exists "); //existiert
trans.put("&empty;", " null "); //leer
trans.put("&nabla;", " nabla "); //nabla
trans.put("&isin;", " element of "); //Element von
trans.put("&notin;", " not element of "); //kein Element von
trans.put("&ni;", " contains "); //enth&auml;lt als Element
trans.put("&prod;", " product "); //Produkt
trans.put("&sum;", " sum "); //Summe
trans.put("&minus;", " minus "); //minus
trans.put("&lowast;", " times "); //Asterisk
trans.put("&radic;", " sqare root "); //Quadratwurzel
trans.put("&prop;", " proportional to "); //proportional zu
trans.put("&infin;", " unlimited "); //unendlich
trans.put("&ang;", " angle "); //Winkel
trans.put("&and;", " and "); //und
trans.put("&or;", " or "); //oder
trans.put("&cap;", " "); //Schnittpunkt
trans.put("&cup;", " unity "); //Einheit
trans.put("&int;", " integral "); //Integral
trans.put("&there4;", " cause "); //deshalb
trans.put("&sim;", " similar to "); //&auml;hnlich wie
trans.put("&cong;", " equal "); //ann&auml;hernd gleich
trans.put("&asymp;", " equal "); //beinahe gleich
trans.put("&ne;", " not equal "); //ungleich
trans.put("&equiv;", " identical "); //identisch mit
trans.put("&le;", " smaller or equal than "); //kleiner gleich
trans.put("&ge;", " greater or equal than "); //gr&ouml;&szlig;er gleich
trans.put("&sub;", " subset of "); //Untermenge von
trans.put("&sup;", " superset of "); //Obermenge von
trans.put("&nsub;", " not subset of "); //keine Untermenge von
trans.put("&sube;", ""); //Untermenge von oder gleich mit
trans.put("&supe;", ""); //Obermenge von oder gleich mit
trans.put("&oplus;", ""); //Direktsumme
trans.put("&otimes;", ""); //Vektorprodukt
trans.put("&perp;", ""); //senkrecht zu
trans.put("&sdot;", ""); //Punkt-Operator
trans.put("&loz;", ""); //Raute
trans.put("&lceil;", ""); //links oben
trans.put("&rceil;", ""); //rechts oben
trans.put("&lfloor;", ""); //links unten
trans.put("&rfloor;", ""); //rechts unten
trans.put("&lang;", ""); //spitze Klammer links
trans.put("&rang;", ""); //spitze Klammer rechts
trans.put("&larr;", ""); //Pfeil links
trans.put("&uarr;", ""); //Pfeil oben
trans.put("&rarr;", ""); //Pfeil rechts
trans.put("&darr;", ""); //Pfeil unten
trans.put("&harr;", ""); //Pfeil links/rechts
trans.put("&crarr;", ""); //Pfeil unten-Knick-links
trans.put("&lArr;", ""); //Doppelpfeil links
trans.put("&uArr;", ""); //Doppelpfeil oben
trans.put("&rArr;", ""); //Doppelpfeil rechts
trans.put("&dArr;", ""); //Doppelpfeil unten
trans.put("&hArr;", ""); //Doppelpfeil links/rechts
trans.put("&bull;", ""); //Bullet-Zeichen
trans.put("&hellip;", ""); //Horizontale Ellipse
trans.put("&prime;", ""); //Minutenzeichen
trans.put("&oline;", ""); //&Uuml;berstrich
trans.put("&frasl;", ""); //Bruchstrich
trans.put("&weierp;", ""); //Weierstrass p
trans.put("&image;", ""); //Zeichen f&uuml;r &quot;imagin&auml;r&quot;
trans.put("&real;", ""); //Zeichen f&uuml;r &quot;real&quot;
trans.put("&trade;", ""); //Trademark-Zeichen
trans.put("&euro;", ""); //Euro-Zeichen
trans.put("&alefsym;", ""); //Alef-Symbol
trans.put("&spades;", ""); //Pik-Zeichen
trans.put("&clubs;", ""); //Kreuz-Zeichen
trans.put("&hearts;", ""); //Herz-Zeichen
trans.put("&diams;", ""); //Karo-Zeichen
trans.put("&ensp;", ""); //Leerzeichen Breite n
trans.put("&emsp;", ""); //Leerzeichen Breite m
trans.put("&thinsp;", ""); //Schmales Leerzeichen
trans.put("&zwnj;", ""); //null breiter Nichtverbinder
trans.put("&zwj;", ""); //null breiter Verbinder
trans.put("&lrm;", ""); //links-nach-rechts-Zeichen
trans.put("&rlm;", ""); //rechts-nach-links-Zeichen
trans.put("&ndash;", ""); //Gedankenstrich Breite n
trans.put("&mdash;", ""); //Gedankenstrich Breite m
trans.put("&lsquo;", ""); //einfaches Anf&uuml;hrungszeichen links
trans.put("&rsquo;", ""); //einfaches Anf&uuml;hrungszeichen rechts
trans.put("&sbquo;", ""); //einfaches low-9-Zeichen
trans.put("&ldquo;", ""); //doppeltes Anf&uuml;hrungszeichen links
trans.put("&rdquo;", ""); //doppeltes Anf&uuml;hrungszeichen rechts
trans.put("&bdquo;", ""); //doppeltes low-9-Zeichen rechts
trans.put("&dagger;", ""); //Kreuz
trans.put("&Dagger;", ""); //Doppelkreuz
trans.put("&permil;", ""); //zu tausend
trans.put("&lsaquo;", ""); //angewinkeltes einzelnes Anf.zeichen links
trans.put("&rsaquo;", ""); //angewinkeltes einzelnes Anf.zeichen rechts
}
public htmlFilterAbstractScraper(HashSet tags0, HashSet tags1) {
this.tags0 = tags0;
this.tags1 = tags1;
@ -55,9 +305,6 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
// The one method this abstract class leaves open: every concrete subclass
// has to implement it.
public abstract void scrapeText(byte[] text);
/* A minimal implementation could simply be an empty body:
{ }
*/
// the other methods must take into account to construct the return value correctly
public void scrapeTag0(String tagname, Properties tagopts) {
@ -66,25 +313,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
// Empty default implementation: the tag name, its options and the text
// passed in are all ignored.
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
}
/**
 * Removes everything between an opening delimiter {@code lb} and the next
 * closing delimiter {@code rb} (delimiters included) until no opening
 * delimiter is left. A removed pair is replaced by one space; an opening
 * delimiter without a closing one is dropped on its own. The pieces on
 * either side of a cut are trimmed, and so is the final result.
 *
 * @param bb the buffer to clean
 * @return the trimmed buffer without delimiter-enclosed sections
 */
protected static serverByteBuffer stripAllTags(serverByteBuffer bb) {
    // always rescan from the start of the (rebuilt) buffer
    for (int open = bb.indexOf(lb); open >= 0; open = bb.indexOf(lb)) {
        final int close = bb.indexOf(rb, open);
        final serverByteBuffer head = new serverByteBuffer(bb.getBytes(0, open)).trim();
        if (close < 0) {
            // unterminated: remove only the opening delimiter itself
            bb = head.append(new serverByteBuffer(bb.getBytes(open + 1)).trim());
        } else {
            // 32 is the space that takes the place of the removed section
            bb = head.append((byte) 32).append(new serverByteBuffer(bb.getBytes(close + 1)).trim());
        }
    }
    return bb.trim();
}
// string conversions
/**
 * Buffer-returning variant of {@code code_iso8859s}.
 *
 * @param c the byte to look up
 * @return a new buffer holding the bytes of the string that
 *         {@code code_iso8859s} returns for {@code c}, or null when that
 *         method returns null
 */
private static serverByteBuffer code_iso8859(byte c) {
    final String replacement = code_iso8859s(c);
    return (replacement == null) ? null : new serverByteBuffer(replacement.getBytes());
}
private static String code_iso8859s(byte c) {
switch ((int) c & 0xff) {
@ -127,279 +356,51 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
}
/**
 * Copies the buffer byte by byte, substituting every byte for which
 * {@code code_iso8859s} supplies a replacement string.
 *
 * @param bb the buffer to convert; it is only read
 * @return a new buffer with the converted content
 */
public static serverByteBuffer convertUmlaute(serverByteBuffer bb) {
    // pre-size the target: replacements may be longer than the single byte
    // they stand for, so leave some headroom
    serverByteBuffer t = new serverByteBuffer(bb.length() + 20);
    byte b;
    String z;
    for (int i = 0; i < bb.length(); i++) {
        b = bb.byteAt(i);
        z = code_iso8859s(b);
        // each source byte is emitted exactly once: either as its
        // replacement or unchanged
        if (z == null) {
            t.append(b);
        } else {
            t.append(z);
        }
    }
    return t;
}
/**
 * Returns the plain-text replacement for one HTML character entity.
 * The mapping is taken from the static {@code trans} table, which holds the
 * same entity/replacement pairs the former chain of string comparisons
 * spelled out one by one.
 *
 * @param code the complete entity including '&' and ';', e.g. "&amp;auml;"
 * @return the replacement text, or "" when the entity is not in the table
 */
private static String transscripts(String code) {
    String replacement = (String) trans.get(code);
    return (replacement == null) ? "" : replacement;
}
private static byte[] transscript(byte[] code) {
return transscripts(new String(code)).getBytes();
String t = (String) trans.get(new String(code));
if (t == null) return new byte[0]; else return t.getBytes();
}
protected static serverByteBuffer transscriptAll(serverByteBuffer bb) {
int p0, p1;
while ((p0 = bb.indexOf((byte) '&')) >= 0) {
int p0 = 0, p1;
byte[] t;
while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) {
p1 = bb.indexOf((byte) ';', p0);
if (p1 >= 0)
bb = new serverByteBuffer(bb.getBytes(0, p0)).append(transscript(bb.getBytes(p0, p1 + 1))).append(bb.getBytes(p1 + 1));
else
bb = new serverByteBuffer(bb.getBytes(0, p0)).append(bb.getBytes(p0 + 1));
if (p1 >= 0) {
t = transscript(bb.getBytes(p0, p1 + 1));
bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1));
} else {
bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1));
}
}
t = null;
return bb;
}
/**
 * Removes every tag delimited by {@code lb} ... {@code rb} from the buffer.
 * A complete tag is replaced by a single space (byte 32) between the trimmed
 * text before it and the trimmed text after it. An opening delimiter that has
 * no closing delimiter after it is dropped on its own.
 *
 * @param bb the buffer to clean; a new buffer is built for each removal
 * @return the trimmed buffer with the tags removed
 */
protected static serverByteBuffer stripAllTags(serverByteBuffer bb) {
    int open = bb.indexOf(lb, 0);
    while (open >= 0) {
        final int close = bb.indexOf(rb, open);
        if (close < 0) {
            // unterminated tag: only the opening delimiter is removed
            final serverByteBuffer head = new serverByteBuffer(bb.getBytes(0, open), bb.length()).trim();
            final serverByteBuffer tail = new serverByteBuffer(bb.getBytes(open + 1)).trim();
            bb = head.append(tail);
        } else {
            // complete tag: join the text around it with one space
            final int capacity = bb.length() + open - close + 1;
            final serverByteBuffer head = new serverByteBuffer(bb.getBytes(0, open), capacity).trim().append((byte) 32);
            final serverByteBuffer tail = new serverByteBuffer(bb.getBytes(close + 1)).trim();
            bb = head.append(tail);
        }
        // continue the search from the same position in the rebuilt buffer
        open = bb.indexOf(lb, open);
    }
    return bb.trim();
}
public static serverByteBuffer stripAll(serverByteBuffer bb) {
//return stripAllTags(s);
return convertUmlaute(transscriptAll(stripAllTags(bb)));

@ -83,14 +83,13 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.images = new HashMap();
this.title = "";
this.headline = "";
this.text = new serverByteBuffer();
this.text = new serverByteBuffer(1024);
}
public void scrapeText(byte[] newtext) {
//System.out.println("SCRAPE: " + new String(newtext));
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
text.append(new serverByteBuffer(super.stripAll(new serverByteBuffer(newtext))).trim()).append((byte) ' ');
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append(32);
text.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32);
}
public static String urlNormalform(URL url) {
@ -122,12 +121,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
//System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
//if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString());
//if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
//if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if ((tagname.equals("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString());
if ((tagname.equals("h1")) && (text.length < 512)) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if ((tagname.equals("title")) && (text.length < 512)) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if ((tagname.equals("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
if ((tagname.equals("h1")) && (text.length < 512)) headline = super.stripAll(new serverByteBuffer(text)).toString();
if ((tagname.equals("title")) && (text.length < 512)) title = super.stripAll(new serverByteBuffer(text)).toString();
}
@ -138,7 +134,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (title.length() > 0) hl = title.trim();
else if (headline.length() > 0) hl = headline.trim();
else if (text.length() > 80) hl = new String(text.getBytes(), 0, 80).trim();
else hl = text.toString().trim();
else hl = text.trim().toString();
// clean the line: may contain too many funny symbols
for (int i = 0; i < hl.length(); i++)

@ -91,7 +91,7 @@ public final class htmlFilterOutputStream extends OutputStream {
this.out = out;
this.scraper = scraper;
this.transformer = transformer;
this.buffer = new serverByteBuffer();
this.buffer = new serverByteBuffer(1024);
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
@ -105,7 +105,7 @@ public final class htmlFilterOutputStream extends OutputStream {
public static byte[] genTag0raw(String tagname, boolean opening, byte[] tagopts) {
serverByteBuffer bb = new serverByteBuffer();
serverByteBuffer bb = new serverByteBuffer(tagname.length() + tagopts.length + 3);
bb.append((byte) '<');
if (!(opening)) bb.append((byte) '/');
bb.append(tagname.getBytes());
@ -119,7 +119,7 @@ public final class htmlFilterOutputStream extends OutputStream {
}
public static byte[] genTag1raw(String tagname, byte[] tagopts, byte[] text) {
serverByteBuffer bb = new serverByteBuffer();
serverByteBuffer bb = new serverByteBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
bb.append((byte) '<').append(tagname.getBytes());
if (tagopts.length > 0) {
//if (tagopts[0] == (byte) 32)
@ -132,22 +132,23 @@ public final class htmlFilterOutputStream extends OutputStream {
return bb.getBytes();
}
public static byte[] genTag0(String tagname, Properties tagopts, byte quotechar) {
serverByteBuffer bb = new serverByteBuffer().append((byte) '<').append(tagname.getBytes());
if (tagopts.size() != 0) bb = bb.append((byte) 32).append(genOpts(tagopts, quotechar));
byte[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar);
serverByteBuffer bb = new serverByteBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2).append((byte) '<').append(tagname.getBytes());
if (tagoptsx != null) bb = bb.append((byte) 32).append(tagoptsx);
bb = bb.append((byte) '>');
return bb.getBytes();
}
public static byte[] genTag1(String tagname, Properties tagopts, byte[] text, byte quotechar) {
return new serverByteBuffer(genTag0(tagname, tagopts, quotechar)).append(text).append(("</" + tagname + ">").getBytes()).getBytes();
byte[] gt0 = genTag0(tagname, tagopts, quotechar);
return new serverByteBuffer(gt0, gt0.length + text.length + tagname.length() + 3).append(text).append(("</" + tagname + ">").getBytes()).getBytes();
}
// a helper method for pretty-printing of properties for html tags
public static byte[] genOpts(Properties prop, byte quotechar) {
Enumeration e = prop.propertyNames();
serverByteBuffer bb = new serverByteBuffer();
serverByteBuffer bb = new serverByteBuffer(prop.size() * 40);
String key;
while (e.hasMoreElements()) {
key = (String) e.nextElement();

@ -164,7 +164,7 @@ public final class httpc {
* A reusable readline buffer
* @see serverByteBuffer
*/
final serverByteBuffer readLineBuffer = new serverByteBuffer();
final serverByteBuffer readLineBuffer = new serverByteBuffer(100);
public String toString() {
return (this.savedRemoteHost == null) ? "Disconnected" : "Connected to " + this.savedRemoteHost +

@ -1098,7 +1098,7 @@ public final class httpd implements serverHandler {
// building the stacktrace
if (stackTrace != null) {
serverByteBuffer errorMsg = new serverByteBuffer();
serverByteBuffer errorMsg = new serverByteBuffer(100);
errorMsg.append("Exception occurred:\r\n\r\n")
.append(stackTrace.toString())
.append("\r\n")

@ -604,19 +604,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// do a local crawl
plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE);
if (urlEntry.url() == null) return false;
String stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
if (urlEntry.url() == null) {
log.logError(stats + ": urlEntry.url() == null");
return true;
}
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
log.logError("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
log.logError(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return true;
}
log.logDebug("LOCALCRAWL: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() +
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
return processLocalCrawling(urlEntry, profile);
processLocalCrawling(urlEntry, profile, stats);
return true;
}
public int limitCrawlTriggerJobSize() {
@ -629,7 +634,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return false;
}
// if the server is busy, we do crawling more slowly
if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}
//if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}
// if crawling was paused we have to wait until we are notified to continue
synchronized(this.crawlingPausedSync) {
@ -643,13 +648,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start a global crawl, if possible
plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
if (urlEntry.url() == null) return true;
String stats = "REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
if (urlEntry.url() == null) {
log.logError(stats + ": urlEntry.url() == null");
return true;
}
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
log.logError("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
log.logError(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return true;
}
log.logDebug("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() +
@ -679,7 +688,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return false;
}
processLocalCrawling(urlEntry, profile);
processLocalCrawling(urlEntry, profile, stats);
return true;
}
@ -710,7 +719,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
*/
// if the server is busy, we do this more slowly
if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}
//if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}
// if crawling was paused we have to wait until we are notified to continue
synchronized(this.crawlingPausedSync) {
@ -724,19 +733,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
plasmaCrawlNURL.entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE);
if (urlEntry.url() == null) return false;
String stats = "REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
if (urlEntry.url() == null) {
log.logError(stats + ": urlEntry.url() == null");
return false;
}
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
if (profile == null) {
log.logError("REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
log.logError(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return false;
}
log.logDebug("plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() +
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
return processLocalCrawling(urlEntry, profile);
processLocalCrawling(urlEntry, profile, stats);
return true;
}
private void processResourceStack(plasmaHTCache.Entry entry) {
@ -1000,21 +1015,22 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (u == null) return plasmaURL.dummyHash; else return u.toString();
}
private boolean processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile) {
private void processLocalCrawling(plasmaCrawlNURL.entry urlEntry, plasmaCrawlProfile.entry profile, String stats) {
// work off one Crawl stack entry
if ((urlEntry == null) && (urlEntry.url() == null)) {
log.logInfo("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
return false;
log.logInfo(stats + ": urlEntry=null");
return;
}
cacheLoader.loadParallel(urlEntry.url(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile);
log.logInfo("LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: enqueued for load " + urlEntry.url());
return true;
log.logInfo(stats + ": enqueued for load " + urlEntry.url());
return;
}
private boolean processRemoteCrawlTrigger(plasmaCrawlNURL.entry urlEntry) {
// return true iff another peer has/will index(ed) the url
if (urlEntry == null) {
log.logInfo("REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
return false;
return true; // superfluous request; true correct in this context
}
// are we qualified?
@ -1027,19 +1043,22 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// check url
if (urlEntry.url() == null) {
log.logDebug("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name());
return false;
return true;
}
String nexturlString = urlEntry.url().toString();
String urlhash = plasmaURL.urlHash(urlEntry.url());
// check remote crawl
yacySeed remoteSeed = yacyCore.dhtAgent.getCrawlSeed(urlhash);
if (remoteSeed == null) {
log.logDebug("plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available");
return false;
return false;
}
// do the request
HashMap page = yacyClient.crawlOrder(remoteSeed, nexturlString, hash2urlstring(urlEntry.referrerHash()), 0);
// check success
/*
@ -1060,7 +1079,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
*/
if ((page == null) || (page.get("delay") == null)) {
log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + nexturlString + ")");
yacyCore.peerActions.peerDeparture(remoteSeed);
if (remoteSeed != null) yacyCore.peerActions.peerDeparture(remoteSeed);
return false;
} else try {
log.logDebug("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + nexturlString + ", response=" + page.toString()); // DEBUG
@ -1093,7 +1112,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
e.printStackTrace();
return false;
}
}

@ -57,12 +57,13 @@ public final class serverByteBuffer extends OutputStream {
private int offset;
private int length;
/**
 * Creates an empty buffer backed by an 80-byte array.
 */
public serverByteBuffer() {
    this.offset = 0;
    this.length = 0;
    this.buffer = new byte[80];
}
public serverByteBuffer(int initLength) {
this.buffer = new byte[initLength];
this.length = 0;
@ -75,6 +76,13 @@ public final class serverByteBuffer extends OutputStream {
offset = 0;
}
/**
 * Creates a buffer that starts with a copy of {@code bb} and has room
 * reserved for further appends.
 *
 * @param bb         the initial content; it is copied, not shared
 * @param initLength the requested size of the backing array; if it is
 *                   smaller than {@code bb.length} the backing array is
 *                   sized to {@code bb.length} instead, so the copy
 *                   cannot overrun it
 */
public serverByteBuffer(byte[] bb, int initLength) {
    // never allocate less than the content needs: System.arraycopy would
    // otherwise throw an IndexOutOfBoundsException for a too-small initLength
    this.buffer = new byte[Math.max(initLength, bb.length)];
    System.arraycopy(bb, 0, buffer, 0, bb.length);
    length = bb.length;
    offset = 0;
}
public serverByteBuffer(byte[] bb, int of, int le) {
if (of * 2 > bb.length) {
buffer = new byte[le];
@ -123,7 +131,9 @@ public final class serverByteBuffer extends OutputStream {
}
private void grow() {
byte[] tmp = new byte[buffer.length * 2 + 1];
int newsize = buffer.length * 2 + 1;
if (newsize < 256) newsize = 256;
byte[] tmp = new byte[newsize];
System.arraycopy(buffer, offset, tmp, 0, length);
buffer = tmp;
tmp = null;
@ -154,6 +164,11 @@ public final class serverByteBuffer extends OutputStream {
return this;
}
/**
 * Appends the lowest eight bits of {@code i} as a single byte.
 *
 * @param i the value whose low byte is written
 * @return this buffer, so that calls can be chained
 */
public serverByteBuffer append(int i) {
    final byte lowByte = (byte) (i & 0xFF);
    write(lowByte);
    return this;
}
public serverByteBuffer append(byte[] bb) {
write(bb);
return this;
@ -237,7 +252,7 @@ public final class serverByteBuffer extends OutputStream {
}
public String toString() {
return new String(getBytes());
return new String(getBytes(), offset, length);
}
public Properties propParser() {

@ -227,6 +227,30 @@ public final class serverDate {
return testSFormatter.format(gregorian.getTime());
}
/**
 * Formats a time span as {@code "<days> day(s) HH:mm"}, for example
 * {@code "1 day 03:07"} or {@code "12 days 00:45"}. Seconds and
 * milliseconds are truncated.
 *
 * @param millis the length of the interval in milliseconds; negative
 *               values are treated as zero
 * @return the formatted interval
 */
public static String intervalToString(long millis) {
    // a negative span has no sensible rendering; show it as zero instead of
    // emitting malformed output such as "00:0-5"
    final long mins = (millis < 0) ? 0 : millis / 60000;

    // all arithmetic stays in long: the former "(int) mins%60" cast before
    // taking the remainder and gave wrong minutes for very large values
    final long uptimeDays = mins / 1440;
    final long uptimeHours = (mins / 60) % 24;
    final long uptimeMins = mins % 60;

    final StringBuffer uptime = new StringBuffer();
    uptime.append(uptimeDays)
          .append((uptimeDays == 1) ? " day " : " days ")
          .append((uptimeHours < 10) ? "0" : "")
          .append(uptimeHours)
          .append(":")
          .append((uptimeMins < 10) ? "0" : "")
          .append(uptimeMins);
    return uptime.toString();
}
public static void main(String[] args) {
//System.out.println("kelondroDate is (" + new kelondroDate().toString() + ")");
System.out.println("serverDate : " + new serverDate().toShortString(false));

Loading…
Cancel
Save