diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index ddc520540..111169f3a 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -149,15 +149,15 @@ public class Bookmarks {
// try to get the bookmark from the LURL database
plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaParserDocument document = null;
- if(urlentry != null){
- document = switchboard.snippetCache.retrieveDocument(urlentry.url(), true);
- }
if (urlentry != null) {
+ plasmaCrawlLURLEntry.Components comp = urlentry.comp();
+ document = switchboard.snippetCache.retrieveDocument(comp.url(), true);
prop.put("mode_edit", 0); // create mode
- prop.put("mode_title", urlentry.descr());
- prop.put("mode_description", (document == null) ? urlentry.descr() : document.getMainLongTitle());
- prop.put("mode_url", urlentry.url());
- prop.put("mode_tags", (document == null) ? "" : document.getKeywords(','));
+ prop.put("mode_url", comp.url().toNormalform());
+ prop.put("mode_title", comp.descr());
+ prop.put("mode_description", (document == null) ? comp.descr(): document.getMainLongTitle());
+ prop.put("mode_author", comp.author());
+ prop.put("mode_tags", (document == null) ? comp.tags() : document.getKeywords(','));
prop.put("mode_public", 0);
}
if (document != null) document.close();
diff --git a/htroot/IndexControl_p.html b/htroot/IndexControl_p.html
index ada516dc4..47fc1564d 100644
--- a/htroot/IndexControl_p.html
+++ b/htroot/IndexControl_p.html
@@ -164,9 +164,6 @@
Loaded-Date | #[loaddate]# |
Referrer | #[referrer]# |
Doctype | #[doctype]# |
- Copy-Count | #[copyCount]# |
- Local-Flag | #[local]# |
- Quality | #[quality]# |
Language | #[language]# |
Size | #[size]# |
Words | #[wordCount]# |
diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index c1c4381aa..73d44636f 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -222,8 +222,7 @@ public class IndexControl_p {
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
- URL url = entry.url();
- urlstring = url.toNormalform();
+ urlstring = entry.comp().url().toNormalform();
prop.put("urlstring", "");
switchboard.urlPool.loadedURL.remove(urlhash);
prop.put("result", "Removed URL " + urlstring);
@@ -339,9 +338,7 @@ public class IndexControl_p {
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
- URL url = entry.url();
- urlstring = url.toString();
- prop.put("urlstring", urlstring);
+ prop.put("urlstring", entry.comp().url().toNormalform());
prop.putAll(genUrlProfile(switchboard, entry, urlhash));
}
}
@@ -410,30 +407,27 @@ public class IndexControl_p {
prop.put("genUrlProfile_urlhash", urlhash);
return prop;
}
- URL url = entry.url();
+ plasmaCrawlLURLEntry.Components comp = entry.comp();
String referrer = null;
plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (le == null) {
referrer = "";
} else {
- referrer = le.url().toString();
+ referrer = le.comp().url().toNormalform();
}
- if (url == null) {
+ if (comp.url() == null) {
prop.put("genUrlProfile", 1);
prop.put("genUrlProfile_urlhash", urlhash);
return prop;
}
prop.put("genUrlProfile", 2);
- prop.put("genUrlProfile_urlNormalform", url.toNormalform());
+ prop.put("genUrlProfile_urlNormalform", comp.url().toNormalform());
prop.put("genUrlProfile_urlhash", urlhash);
- prop.put("genUrlProfile_urlDescr", entry.descr());
+ prop.put("genUrlProfile_urlDescr", comp.descr());
prop.put("genUrlProfile_moddate", entry.moddate());
prop.put("genUrlProfile_loaddate", entry.loaddate());
prop.put("genUrlProfile_referrer", referrer);
prop.put("genUrlProfile_doctype", ""+entry.doctype());
- prop.put("genUrlProfile_copyCount", entry.copyCount());
- prop.put("genUrlProfile_local", ""+entry.local());
- prop.put("genUrlProfile_quality", entry.quality());
prop.put("genUrlProfile_language", entry.language());
prop.put("genUrlProfile_size", entry.size());
prop.put("genUrlProfile_wordCount", entry.wordCount());
@@ -467,7 +461,7 @@ public class IndexControl_p {
if (le == null) {
tm.put(uh[0], uh);
} else {
- us = le.url().toString();
+ us = le.comp().url().toNormalform();
tm.put(us, uh);
}
diff --git a/htroot/PerformanceMemory_p.java b/htroot/PerformanceMemory_p.java
index 07e689a21..b880900a8 100644
--- a/htroot/PerformanceMemory_p.java
+++ b/htroot/PerformanceMemory_p.java
@@ -337,6 +337,7 @@ public class PerformanceMemory_p {
}
private static void putprop(serverObjects prop, serverSwitch env, String wdb, String db, String set) {
+ if ((slt == null) || (ost == null)) return;
usd = chk * slt[1] + obj * ost[2] /*hit*/ + kelondroTree.cacheObjectMissSize * ost[3] /*miss*/;
bst = (((((long) chk) * ((long) req)) >> 10) + 1) << 10;
if (set.equals("setBest")) env.setConfig("ramCache" + db, bst);
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index ced7a6386..7302d7465 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -54,7 +54,6 @@ import java.util.Enumeration;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
-import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
@@ -116,12 +115,12 @@ public class ViewFile {
}
// gettin the url that belongs to the entry
- URL url = urlEntry.url();
- if (url == null) {
+ plasmaCrawlLURLEntry.Components comp = urlEntry.comp();
+ if ((comp == null) || (comp.url() == null)) {
prop.put("error",3);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
- }
+ }
// loading the resource content as byte array
InputStream resource = null;
@@ -130,14 +129,14 @@ public class ViewFile {
String resMime = null;
try {
// trying to load the resource body
- resource = sb.cacheManager.getResourceContentStream(url);
- resourceLength = sb.cacheManager.getResourceContentLength(url);
+ resource = sb.cacheManager.getResourceContentStream(comp.url());
+ resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
// if the resource body was not cached we try to load it from web
if (resource == null) {
plasmaHTCache.Entry entry = null;
try {
- entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false);
+ entry = sb.snippetCache.loadResourceFromWeb(comp.url(), 5000, false);
} catch (plasmaCrawlerException e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
@@ -147,8 +146,8 @@ public class ViewFile {
if (entry != null) {
resInfo = entry.getDocumentInfo();
- resource = sb.cacheManager.getResourceContentStream(url);
- resourceLength = sb.cacheManager.getResourceContentLength(url);
+ resource = sb.cacheManager.getResourceContentStream(comp.url());
+ resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
}
if (resource == null) {
@@ -164,19 +163,19 @@ public class ViewFile {
// try to load the metadata from cache
try {
- resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url());
+ resInfo = sb.cacheManager.loadResourceInfo(comp.url());
} catch (Exception e) { /* ignore this */}
// if the metadata where not cached try to load it from web
if (resInfo == null) {
- String protocol = url.getProtocol();
+ String protocol = comp.url().getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error",6);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
- httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
+ httpHeader responseHeader = httpc.whead(comp.url(),comp.url().getHost(),5000,null,null,sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error",4);
prop.put("error_errorText","Unable to load resource metadata.");
@@ -184,7 +183,7 @@ public class ViewFile {
return prop;
}
try {
- resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader);
+ resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(comp.url(), responseHeader);
} catch (Exception e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
@@ -230,12 +229,12 @@ public class ViewFile {
prop.put("viewMode_plainText",content);
} else if (viewMode.equals("iframe")) {
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
- prop.put("viewMode_url",url.toString());
+ prop.put("viewMode_url",comp.url().toNormalform());
} else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
- document = sb.snippetCache.parseDocument(url, resourceLength, resource,resInfo);
+ document = sb.snippetCache.parseDocument(comp.url(), resourceLength, resource,resInfo);
if (document == null) {
prop.put("error",5);
prop.put("error_errorText","Unknown error");
@@ -295,13 +294,13 @@ public class ViewFile {
}
if (document != null) document.close();
}
- prop.put("error",0);
- prop.put("error_url",url.toString());
- prop.put("error_hash",urlHash);
- prop.put("error_wordCount",Integer.toString(urlEntry.wordCount()));
- prop.put("error_desc",urlEntry.descr());
- prop.put("error_size",urlEntry.size());
- prop.put("error_mimeType",resMime);
+ prop.put("error", 0);
+ prop.put("error_url", comp.url().toNormalform());
+ prop.put("error_hash", urlHash);
+ prop.put("error_wordCount", Integer.toString(urlEntry.wordCount()));
+ prop.put("error_desc", comp.descr());
+ prop.put("error_size", urlEntry.size());
+ prop.put("error_mimeType", resMime);
return prop;
}
diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java
index b85223a75..cf29c69e7 100644
--- a/htroot/htdocsdefault/dir.java
+++ b/htroot/htdocsdefault/dir.java
@@ -359,7 +359,7 @@ public class dir {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
- url, "YaCyShare: " + descr, new Date(), new Date(),
+ url.toNormalform(), "YaCyShare: " + descr, new Date(), new Date(),
"AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/
false, /*localneed*/
diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java
index eb29cf5f6..57d295c29 100644
--- a/htroot/yacy/crawlReceipt.java
+++ b/htroot/yacy/crawlReceipt.java
@@ -125,8 +125,13 @@ public final class crawlReceipt {
} else if (result.equals("fill")) {
// generating a new loaded URL entry
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true);
- if ((entry == null)||(entry.url()==null)) {
- log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam +
+ if (entry == null) {
+ log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam +
+ "\n\tURL properties: "+ propStr);
+ } else {
+ plasmaCrawlLURLEntry.Components comp = entry.comp();
+ if (comp.url() == null) {
+ log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
} else try {
// put new entry into database
@@ -134,18 +139,18 @@ public final class crawlReceipt {
switchboard.urlPool.loadedURL.stack(entry, youare, iam, 1);
// generating url hash
- String newUrlHash = indexURL.urlHash(entry.url());
- String oldUrlHash = indexURL.oldurlHash(entry.url());
+ String newUrlHash = indexURL.urlHash(comp.url());
+ String oldUrlHash = indexURL.oldurlHash(comp.url());
// removing URL from notice URL
switchboard.urlPool.noticeURL.remove(newUrlHash);
switchboard.urlPool.noticeURL.remove(oldUrlHash);
- log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + entry.url());
+ log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + comp.url().toNormalform());
} catch (IOException e) {
e.printStackTrace();
}
-
+ }
// ready for more
prop.put("delay", "10");
} else {
diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index 6f687d874..a30d41367 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -249,7 +249,7 @@ public final class search {
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = (plasmaCrawlLURLEntry) acc.nextElement();
if (includesnippet) {
- snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260, 1000);
+ snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, 260, 1000);
} else {
snippet = null;
}
diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java
index 9ae72dfb7..ac551bc81 100644
--- a/htroot/yacy/transferURL.java
+++ b/htroot/yacy/transferURL.java
@@ -98,25 +98,29 @@ public final class transferURL {
yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
} else {
lEntry = sb.urlPool.loadedURL.newEntry(urls, true);
- if ((lEntry != null) && (lEntry.url() != null)) {
- if ((blockBlacklist) &&
- (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), lEntry.url()))) {
- int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
- yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
- lEntry = null;
- blocked++;
- } else try {
- sb.urlPool.loadedURL.store(lEntry);
- sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3);
- yacyCore.log.logFine("transferURL: received URL '" + lEntry.url() + "' from peer " + otherPeerName);
- received++;
- } catch (IOException e) {
- e.printStackTrace();
- }
- } else {
- yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName +
- "\n\tURL Property: " + urls);
+ if (lEntry == null) {
+ yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???
+ } else {
+ plasmaCrawlLURLEntry.Components comp = lEntry.comp();
+ if (comp.url() == null) {
+ yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
+ // TODO: should we send back an error message???
+ } else {
+ if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
+ int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
+ yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
+ lEntry = null;
+ blocked++;
+ } else try {
+ sb.urlPool.loadedURL.store(lEntry);
+ sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3);
+ yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
+ received++;
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
}
}
}
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 71e7f8996..1721351fd 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -191,13 +191,15 @@ public class yacysearch {
final String recommendHash = post.get("recommendref", ""); // urlhash
plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
if (urlentry != null) {
- plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true);
+ plasmaCrawlLURLEntry.Components comp = urlentry.comp();
+ plasmaParserDocument document;
+ document = sb.snippetCache.retrieveDocument(comp.url(), true);
if (document != null) {
// create a news message
HashMap map = new HashMap();
- map.put("url", urlentry.url().toNormalform().replace(',', '|'));
- map.put("title", urlentry.descr().replace(',', ' '));
- map.put("description", ((document == null) ? urlentry.descr() : document.getMainLongTitle()).replace(',', ' '));
+ map.put("url", comp.url().toNormalform().replace(',', '|'));
+ map.put("title", comp.descr().replace(',', ' '));
+ map.put("description", ((document == null) ? comp.descr() : document.getMainLongTitle()).replace(',', ' '));
map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
document.close();
diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java
index c92380fc5..41cbfda73 100644
--- a/source/de/anomic/index/indexURL.java
+++ b/source/de/anomic/index/indexURL.java
@@ -52,6 +52,7 @@ public class indexURL {
public static final int urlStringLength = 256;// not too short for links without parameters
public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or )
public static final int urlNameLength = 40; // the tag content between and
+ public static final int urldescrtagsLength = 320;// the url, the description and tags in one string
public static final int urlErrorLength = 80; // a reason description for unavailable urls
public static final int urlDateLength = 4; // any date, shortened
public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index
diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java
index 2932f8d19..7a73b35eb 100644
--- a/source/de/anomic/kelondro/kelondroCollectionIndex.java
+++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java
@@ -158,7 +158,10 @@ public class kelondroCollectionIndex {
ientry.setCol(idx_col_indexpos, j);
ientry.setCol(idx_col_lastread, t);
ientry.setCol(idx_col_lastwrote, t);
- index.put(ientry);
+ if (index instanceof kelondroBufferedIndex)
+ ((kelondroBufferedIndex) index).add(ientry);
+ else
+ index.put(ientry);
// write a log
if (System.currentTimeMillis() - lastlog > 30000) {
diff --git a/source/de/anomic/kelondro/kelondroRow.java b/source/de/anomic/kelondro/kelondroRow.java
index 846515795..f172aebcf 100644
--- a/source/de/anomic/kelondro/kelondroRow.java
+++ b/source/de/anomic/kelondro/kelondroRow.java
@@ -40,7 +40,7 @@ public class kelondroRow {
protected kelondroColumn[] row;
protected int[] colstart;
protected int objectsize;
- protected Map nickref = null;
+ protected Map nickref = null; // a mapping from nicknames to Object[2]{kelondroColumn, Integer(colstart)}
public kelondroRow(kelondroColumn[] row) {
this.row = row;
@@ -142,7 +142,12 @@ public class kelondroRow {
if (external == null) return null;
return new Entry(external);
}
-
+ /*
+ public Entry newEntry(Properties prop) {
+ if (prop == null) return null;
+ return new Entry(prop);
+ }
+ */
public class Entry implements Comparable {
private byte[] rowinstance;
@@ -202,7 +207,19 @@ public class kelondroRow {
}
}
}
-
+ /*
+ public Entry(Properties prop) {
+ // parse external form
+ if (nickref == null) genNickRef();
+ rowinstance = new byte[objectsize];
+ Iterator i = prop.entrySet().iterator();
+ Map.Entry entry;
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ setCol(((String) entry.getKey()).trim(), ((String) entry.getValue()).trim().getBytes());
+ }
+ }
+ */
public int compareTo(Object o) {
if (o instanceof Entry) {
return kelondroNaturalOrder.naturalOrder.compare(this.rowinstance, ((Entry) o).rowinstance);
@@ -354,7 +371,7 @@ public class kelondroRow {
return getColLong(row[column].encoder(), colstart[column], row[column].cellwidth());
}
- public long getColLong(int encoder, int offset, int length) {
+ private long getColLong(int encoder, int offset, int length) {
// start - fix for badly stored parameters
if ((length >= 3) && (rowinstance[offset] == '[') && (rowinstance[offset + 1] == 'B') && (rowinstance[offset + 2] == '@')) return 0;
if ((length == 2) && (rowinstance[offset] == '[') && (rowinstance[offset + 1] == 'B')) return 0;
@@ -378,6 +395,13 @@ public class kelondroRow {
throw new kelondroException("ROW", "getColLong did not find appropriate encoding");
}
+ public byte getColByte(String nickname, byte dflt) { // byte at the start of the column named 'nickname', or dflt if no such column is known
+ if (nickref == null) genNickRef(); // nickname lookup table is created lazily on first use (presumably genNickRef() fills nickref - confirm)
+ Object[] ref = (Object[]) nickref.get(nickname); // Object[2]{kelondroColumn, Integer(colstart)} as described on the nickref field
+ if (ref == null) return dflt; // unknown nickname: hand back the caller's default instead of failing
+ return rowinstance[((Integer) ref[1]).intValue()]; // ref[1] is the column's start offset inside rowinstance
+ }
+
public byte getColByte(int column) {
return rowinstance[colstart[column]];
}
diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java
index 8ba925c57..52376ec02 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@@ -195,7 +195,7 @@ public final class plasmaCrawlLURL extends indexURL {
}
}
- public synchronized plasmaCrawlLURLEntry newEntry(URL url, String descr, Date moddate, Date loaddate,
+ public synchronized plasmaCrawlLURLEntry newEntry(String url, String descr, Date moddate, Date loaddate,
String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype,
int size, int wordCount) {
@@ -338,7 +338,6 @@ public final class plasmaCrawlLURL extends indexURL {
String cachepath, urlstr, urltxt;
yacySeed initiatorSeed, executorSeed;
plasmaCrawlLURLEntry urle;
- URL url;
// needed for getCachePath(url)
final plasmaSwitchboard switchboard = plasmaSwitchboard.getSwitchboard();
@@ -353,14 +352,14 @@ public final class plasmaCrawlLURL extends indexURL {
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = load(urlHash, null);
+ plasmaCrawlLURLEntry.Components comp = urle.comp();
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash);
- url = urle.url();
- urlstr = url.toString();
+ urlstr = comp.url().toNormalform();
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
- cachepath = (url == null) ? "-not-cached-" : cacheManager.getCachePath(url).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1);
+ cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1);
prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_feedbackpage", feedbackpage);
@@ -372,8 +371,8 @@ public final class plasmaCrawlLURL extends indexURL {
prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? dfltExec : executorSeed.getName());
prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate()));
prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount());
- prop.put("table_indexed_" + cnt + "_urldescr", urle.descr());
- prop.put("table_indexed_" + cnt + "_url", (urle.url() == null) ? "-not-cached-" : ((makeLink) ? ("" + urltxt + "") : urlstr));
+ prop.put("table_indexed_" + cnt + "_urldescr", comp.descr());
+ prop.put("table_indexed_" + cnt + "_url", (cachepath == null) ? "-not-cached-" : ((makeLink) ? ("" + urltxt + "") : urlstr));
dark = !dark;
cnt++;
} catch (Exception e) {
@@ -535,18 +534,19 @@ public final class plasmaCrawlLURL extends indexURL {
}
plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next();
+ plasmaCrawlLURLEntry.Components comp = entry.comp();
totalSearchedUrls++;
- if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, entry.url()) ||
- plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, entry.url())) {
- lastBlacklistedUrl = entry.url().toString();
+ if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, comp.url()) ||
+ plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, comp.url())) {
+ lastBlacklistedUrl = comp.url().toNormalform();
lastBlacklistedHash = entry.hash();
- serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + entry.url());
+ serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + comp.url().toNormalform());
remove(entry.hash());
if (blacklistedUrls % 100 == 0) {
serverLog.logInfo("URLDBCLEANER", "Deleted " + blacklistedUrls + " URLs until now. Last deleted URL-Hash: " + lastBlacklistedUrl);
}
}
- lastUrl = entry.url().toString();
+ lastUrl = comp.url().toNormalform();
lastHash = entry.hash();
}
} catch (RuntimeException e) {
@@ -605,7 +605,7 @@ public final class plasmaCrawlLURL extends indexURL {
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false);
final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) {
- ((plasmaCrawlLURLEntry) enu.next()).print();
+ System.out.println(((plasmaCrawlLURLEntry) enu.next()).toString());
}
} catch (Exception e) {
e.printStackTrace();
diff --git a/source/de/anomic/plasma/plasmaCrawlLURLEntry.java b/source/de/anomic/plasma/plasmaCrawlLURLEntry.java
index b66c49c1c..18c859a6b 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURLEntry.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURLEntry.java
@@ -27,10 +27,11 @@
package de.anomic.plasma;
import java.io.IOException;
+import java.net.MalformedURLException;
import java.util.Date;
-import de.anomic.net.URL;
import de.anomic.kelondro.kelondroRow;
+import de.anomic.net.URL;
import de.anomic.index.indexEntry;
public interface plasmaCrawlLURLEntry {
@@ -39,9 +40,7 @@ public interface plasmaCrawlLURLEntry {
public String hash();
- public URL url();
-
- public String descr();
+ public Components comp();
public Date moddate();
@@ -51,12 +50,6 @@ public interface plasmaCrawlLURLEntry {
public char doctype();
- public int copyCount();
-
- public boolean local();
-
- public int quality();
-
public String language();
public int size();
@@ -73,6 +66,26 @@ public interface plasmaCrawlLURLEntry {
public String toString();
- public void print();
-
+ public class Components { // value holder for the textual parts of an entry: url, description, author, tags and ETag
+ private URL url; // parsed form of the url string given to the constructor; null if that string was malformed
+ private String descr, author, tags, ETag; // stored exactly as passed in, no validation or trimming
+
+ public Components(String url, String descr, String author, String tags, String ETag) { // never throws on a bad url string, see catch below
+ try {
+ this.url = new URL(url); // parse once here so that url() can return a ready URL object
+ } catch (MalformedURLException e) { // an unparsable url string is deliberately not propagated
+ this.url = null; // signals "no valid url"; callers have to null-check url()
+ }
+ this.descr = descr;
+ this.author = author;
+ this.tags = tags;
+ this.ETag = ETag;
+ }
+ public URL url() { return this.url; } // may be null when the constructor could not parse the url string
+ public String descr() { return this.descr; } // description as given to the constructor
+ public String author() { return this.author; } // author as given to the constructor
+ public String tags() { return this.tags; } // tags as given to the constructor
+ public String ETag() { return this.ETag; } // ETag as given to the constructor
+ }
+
}
diff --git a/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java b/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java
new file mode 100644
index 000000000..bd00fe8d2
--- /dev/null
+++ b/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java
@@ -0,0 +1,379 @@
+package de.anomic.plasma;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.text.ParseException;
+import java.util.Date;
+import java.util.Properties;
+import java.util.ArrayList;
+
+import de.anomic.index.indexEntry;
+import de.anomic.index.indexURL;
+import de.anomic.index.indexURLEntry;
+import de.anomic.kelondro.kelondroNaturalOrder;
+import de.anomic.kelondro.kelondroBase64Order;
+import de.anomic.kelondro.kelondroRow;
+import de.anomic.net.URL;
+import de.anomic.server.serverByteBuffer;
+import de.anomic.server.serverCodings;
+import de.anomic.tools.crypt;
+import de.anomic.tools.bitfield;
+import de.anomic.tools.nxTools;
+
+/**
+ * Entry of the loaded-URL database in the row format defined by rowdef.
+ * All persistent values live in one kelondroRow.Entry and are decoded by the accessors on demand;
+ * the snippet and the word entry are kept beside the row and are not stored in it.
+ */
+public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
+
+    // milliseconds per day; the mod and load columns hold dates as day counts
+    private static final long DAY_MILLIS = 86400000L;
+
+    public static final kelondroRow rowdef = new kelondroRow(
+        "String hash-12, " +            // the url's hash
+        "String comp-360, " +           // components: the url, description, author and tags. As 5th element, an ETag is possible
+        "Cardinal mod-4 {b256}, " +     // last-modified from the httpd
+        "Cardinal load-4 {b256}, " +    // time when the url was loaded
+        "String referrer-12, " +        // (one of) the url's referrer hash(es)
+        "byte[] md5-8, " +              // the md5 of the url content (to identify changes)
+        "Cardinal size-6 {b256}, " +    // size of file in bytes
+        "Cardinal wc-3 {b256}, " +      // size of file by number of words; for video and audio: seconds
+        "byte[] dt-1, " +               // doctype, taken from extension or any other heuristic
+        "byte[] flags-4, " +            // flags; any stuff (see Word-Entity definition)
+        "String lang-2, " +             // language
+        "Cardinal llocal-2 {b256}, " +  // # of outlinks to same domain; for video and image: width
+        "Cardinal lother-2 {b256}, " +  // # of outlinks to outside domain; for video and image: height
+        "Cardinal limage-2 {b256}, " +  // # of embedded image links
+        "Cardinal laudio-2 {b256}, " +  // # of embedded audio links; for audio: track number; for video: number of audio tracks
+        "Cardinal lvideo-2 {b256}, " +  // # of embedded video links
+        "Cardinal lapp-2 {b256}");      // # of embedded links to applications
+
+    private kelondroRow.Entry entry;
+    private String snippet;
+    private indexEntry word; // this is only used if the url is transported via remote search requests
+
+    /**
+     * Creates a new entry from the attributes of a loaded document.
+     * The hash column is computed from the url; url, descr, author, tags and ETag are packed
+     * into the comp column; mod and load are stored with day resolution.
+     * Snippet and word are left null.
+     */
+    public plasmaCrawlLURLNewEntry(
+            URL url,
+            String descr,
+            String author,
+            String tags,
+            String ETag,
+            Date mod,
+            Date load,
+            String referrer,
+            byte[] md5,
+            long size,
+            int wc,
+            byte dt,
+            bitfield flags,
+            String lang,
+            int llocal,
+            int lother,
+            int laudio,
+            int limage,
+            int lvideo,
+            int lapp) {
+        this.entry = rowdef.newEntry();
+        this.entry.setCol("hash", indexURL.urlHash(url), null);
+        this.entry.setCol("comp", encodeComp(url, descr, author, tags, ETag));
+        this.entry.setCol("mod", encodeDate(mod));
+        this.entry.setCol("load", encodeDate(load));
+        this.entry.setCol("referrer", referrer.getBytes());
+        this.entry.setCol("md5", md5);
+        this.entry.setCol("size", size);
+        this.entry.setCol("wc", wc);
+        this.entry.setCol("dt", dt);
+        this.entry.setCol("flags", flags.getBytes());
+        this.entry.setCol("lang", lang.getBytes());
+        this.entry.setCol("llocal", llocal);
+        this.entry.setCol("lother", lother);
+        this.entry.setCol("limage", limage);
+        this.entry.setCol("laudio", laudio);
+        this.entry.setCol("lvideo", lvideo);
+        this.entry.setCol("lapp", lapp);
+        this.snippet = null;
+        this.word = null;
+    }
+
+    // encodes a date as the number of whole days since the epoch (encodeLong with length 4)
+    byte[] encodeDate(Date d) {
+        return kelondroNaturalOrder.encodeLong(d.getTime() / DAY_MILLIS, 4);
+    }
+
+    /**
+     * Serializes the five components into the content of the comp column.
+     * Each value is followed by a line feed (char 10); the order url, descr, author, tags, ETag
+     * is the order in which comp() reads the lines back.
+     */
+    byte[] encodeComp(URL url, String descr, String author, String tags, String ETag) {
+        serverByteBuffer s = new serverByteBuffer(200);
+        s.append(url.toNormalform()).append((char) 10);
+        s.append(descr).append((char) 10); // second line; comp() expects the description here
+        s.append(author).append((char) 10);
+        s.append(tags).append((char) 10);
+        s.append(ETag).append((char) 10);
+        return s.getBytes();
+    }
+
+    /**
+     * Wraps an existing row; the row is used as it is.
+     * @param entry the row that holds the column values
+     * @param searchedWord word entry that word() will return
+     */
+    public plasmaCrawlLURLNewEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
+        this.entry = entry;
+        this.snippet = null;
+        this.word = searchedWord;
+    }
+
+    /**
+     * Generates an entry from a property list.
+     * The property names must correspond to the ones written by toString.
+     * @param prop the property list
+     * @param setGlobal not used by this implementation
+     * @throws IOException if the url property is not a well-formed url
+     */
+    public plasmaCrawlLURLNewEntry(Properties prop, boolean setGlobal) throws IOException {
+        URL url;
+        try {
+            url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
+        } catch (MalformedURLException e) {
+            throw new IOException("URL is not proper: " + crypt.simpleDecode(prop.getProperty("url", ""), null));
+        }
+        String descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (descr == null) descr = "";
+        String author = crypt.simpleDecode(prop.getProperty("author", ""), null); if (author == null) author = "";
+        String tags = crypt.simpleDecode(prop.getProperty("tags", ""), null); if (tags == null) tags = "";
+        String ETag = crypt.simpleDecode(prop.getProperty("ETag", ""), null); if (ETag == null) ETag = "";
+
+        this.entry = rowdef.newEntry();
+        this.entry.setCol("hash", indexURL.urlHash(url), null);
+        this.entry.setCol("comp", encodeComp(url, descr, author, tags, ETag));
+        // a date that cannot be parsed is replaced by the current date
+        try {
+            this.entry.setCol("mod", encodeDate(indexURL.shortDayFormatter.parse(prop.getProperty("mod", "20000101"))));
+        } catch (ParseException e) {
+            this.entry.setCol("mod", encodeDate(new Date()));
+        }
+        try {
+            this.entry.setCol("load", encodeDate(indexURL.shortDayFormatter.parse(prop.getProperty("load", "20000101"))));
+        } catch (ParseException e) {
+            this.entry.setCol("load", encodeDate(new Date()));
+        }
+        this.entry.setCol("referrer", prop.getProperty("referrer", indexURL.dummyHash).getBytes());
+        this.entry.setCol("md5", serverCodings.decodeHex(prop.getProperty("md5", indexURL.dummyHash)));
+        this.entry.setCol("size", Integer.parseInt(prop.getProperty("size", "0")));
+        this.entry.setCol("wc", Integer.parseInt(prop.getProperty("wc", "0")));
+        this.entry.setCol("dt", prop.getProperty("dt", "t").charAt(0));
+        this.entry.setCol("flags", serverCodings.decodeHex(prop.getProperty("flags", "00000000")));
+        this.entry.setCol("lang", prop.getProperty("lang", "uk").getBytes());
+        this.entry.setCol("llocal", Integer.parseInt(prop.getProperty("llocal", "0")));
+        this.entry.setCol("lother", Integer.parseInt(prop.getProperty("lother", "0")));
+        this.entry.setCol("limage", Integer.parseInt(prop.getProperty("limage", "0")));
+        this.entry.setCol("laudio", Integer.parseInt(prop.getProperty("laudio", "0")));
+        this.entry.setCol("lvideo", Integer.parseInt(prop.getProperty("lvideo", "0")));
+        this.entry.setCol("lapp", Integer.parseInt(prop.getProperty("lapp", "0")));
+        this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null);
+        this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null;
+    }
+
+    /**
+     * Generates a parseable string; this is a simple comma-separated property list
+     * without the surrounding braces.
+     * @return the property list, or null if any value could not be read or encoded
+     */
+    private StringBuffer corePropList() {
+        plasmaCrawlLURLEntry.Components comp = this.comp();
+        final StringBuffer s = new StringBuffer(300);
+        try {
+            s.append("hash=").append(hash());
+            s.append(",url=").append(crypt.simpleEncode(comp.url().toNormalform()));
+            s.append(",descr=").append(crypt.simpleEncode(comp.descr()));
+            s.append(",author=").append(crypt.simpleEncode(comp.author()));
+            s.append(",tags=").append(crypt.simpleEncode(comp.tags()));
+            s.append(",ETag=").append(crypt.simpleEncode(comp.ETag()));
+            s.append(",mod=").append(indexURL.shortDayFormatter.format(moddate()));
+            s.append(",load=").append(indexURL.shortDayFormatter.format(loaddate()));
+            s.append(",referrer=").append(referrerHash());
+            s.append(",md5=").append(md5());
+            s.append(",size=").append(size());
+            s.append(",wc=").append(wordCount());
+            s.append(",dt=").append(doctype());
+            s.append(",flags=").append(serverCodings.encodeHex(flags().getBytes()));
+            s.append(",lang=").append(language());
+            s.append(",llocal=").append(llocal());
+            s.append(",lother=").append(lother());
+            s.append(",limage=").append(limage());
+            s.append(",laudio=").append(laudio());
+            s.append(",lvideo=").append(lvideo());
+            s.append(",lapp=").append(lapp());
+
+            if (this.word != null) {
+                // append also word properties
+                s.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false)));
+            }
+            return s;
+        } catch (Exception e) {
+            // best effort: any failure (for example a null url in comp) makes the whole list unavailable
+            return null;
+        }
+    }
+
+    public kelondroRow.Entry toRowEntry() throws IOException {
+        return this.entry;
+    }
+
+    public String hash() {
+        // return a url-hash, based on the md5 algorithm
+        // the result is a String of 12 bytes within a 72-bit space
+        // (each byte has an 6-bit range)
+        // that should be enough for all web pages on the world
+        return this.entry.getColString("hash", "", null);
+    }
+
+    /**
+     * Decodes the comp column into its components.
+     * Missing lines are replaced by empty strings.
+     */
+    public plasmaCrawlLURLEntry.Components comp() {
+        ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8");
+        return new plasmaCrawlLURLEntry.Components(
+            (cl.size() > 0) ? (String) cl.get(0) : "",
+            (cl.size() > 1) ? (String) cl.get(1) : "",
+            (cl.size() > 2) ? (String) cl.get(2) : "",
+            (cl.size() > 3) ? (String) cl.get(3) : "",
+            (cl.size() > 4) ? (String) cl.get(4) : "");
+    }
+
+    public Date moddate() {
+        return new Date(DAY_MILLIS * entry.getColLong("mod", 0));
+    }
+
+    public Date loaddate() {
+        return new Date(DAY_MILLIS * entry.getColLong("load", 0));
+    }
+
+    public String referrerHash() {
+        // return the creator's hash
+        return entry.getColString("referrer", indexURL.dummyHash, null);
+    }
+
+    public String md5() {
+        // returns the md5 in hex representation
+        return serverCodings.encodeHex(entry.getCol("md5", indexURL.dummyHash.getBytes()));
+    }
+
+    public char doctype() {
+        return (char) entry.getColByte("dt", (byte) 't');
+    }
+
+    public String language() {
+        return this.entry.getColString("lang", "uk", null);
+    }
+
+    public int size() {
+        return (int) this.entry.getColLong("size", 0);
+    }
+
+    public bitfield flags() {
+        return new bitfield(this.entry.getCol("flags", new byte[4]));
+    }
+
+    public int wordCount() {
+        return (int) this.entry.getColLong("wc", 0);
+    }
+
+    public int llocal() {
+        return (int) this.entry.getColLong("llocal", 0);
+    }
+
+    public int lother() {
+        return (int) this.entry.getColLong("lother", 0);
+    }
+
+    public int limage() {
+        return (int) this.entry.getColLong("limage", 0);
+    }
+
+    public int laudio() {
+        return (int) this.entry.getColLong("laudio", 0);
+    }
+
+    public int lvideo() {
+        return (int) this.entry.getColLong("lvideo", 0);
+    }
+
+    public int lapp() {
+        return (int) this.entry.getColLong("lapp", 0);
+    }
+
+    public String snippet() {
+        // the snippet may appear here if the url was transported in a remote search
+        // it will not be saved anywhere, but can only be requested here
+        return snippet;
+    }
+
+    public indexEntry word() {
+        return word;
+    }
+
+    /**
+     * Compares the age of this entry with another one.
+     * @return true if this entry has an earlier modification date, or the same modification date
+     *         and a load date that is not later than the other's; false otherwise and for null
+     */
+    public boolean isOlder(plasmaCrawlLURLEntry other) {
+        if (other == null) return false;
+        Date tmoddate = moddate();
+        Date omoddate = other.moddate();
+        if (tmoddate.before(omoddate)) return true;
+        if (tmoddate.equals(omoddate)) {
+            Date tloaddate = loaddate();
+            Date oloaddate = other.loaddate();
+            if (tloaddate.before(oloaddate)) return true;
+            if (tloaddate.equals(oloaddate)) return true;
+        }
+        return false;
+    }
+
+    /**
+     * Returns the property list in braces with the encoded snippet appended,
+     * which is the form needed for remote transport.
+     * @param snippet the snippet text; must not be null
+     * @return the string form, or null if the property list could not be generated
+     */
+    public String toString(String snippet) {
+        final StringBuffer core = corePropList();
+        if (core == null) return null;
+
+        core.ensureCapacity(core.length() + snippet.length() * 2);
+        core.insert(0, "{");
+        core.append(",snippet=").append(crypt.simpleEncode(snippet));
+        core.append("}");
+
+        return core.toString();
+    }
+
+    /**
+     * Returns this object as String: the property list of corePropList in braces, in the form
+     * {hash=...,url=...,descr=...,author=...,tags=...,ETag=...,mod=...,load=...,referrer=...,md5=...,size=...,wc=...,dt=...,flags=...,lang=...,llocal=...,lother=...,limage=...,laudio=...,lvideo=...,lapp=...}
+     * The word property is appended if a word entry is attached.
+     * @return the string form, or null if the property list could not be generated
+     */
+    public String toString() {
+        final StringBuffer core = corePropList();
+        if (core == null) return null;
+
+        core.insert(0, "{");
+        core.append("}");
+
+        return core.toString();
+    }
+
+}
diff --git a/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java b/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java
index b6c9aa09f..570711e98 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java
@@ -36,7 +36,6 @@ import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
-import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.crypt;
@@ -57,7 +56,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
"Cardinal size-" + indexURL.urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + indexURL.urlWordCountLength + " {b64e}"); // word count
- private URL url;
+ private String url;
private String descr;
private Date moddate;
private Date loaddate;
@@ -73,19 +72,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
private String snippet;
private indexEntry word; // this is only used if the url is transported via remote search requests
- // more needed attributes:
- // - author / copyright owner
- // - keywords
- // - phrasecount, total number of phrases
- // - boolean: URL attributes (see Word-Entity definition)
- // - boolean: appearance of bold and/or italics
- // - ETag: for re-crawl decision upon HEAD request
- // - int: # of outlinks to same domain
- // - int: # of outlinks to outside domain
- // - int: # of keywords
- // - int: # der auf der Seite vorhandenen Links zu image, audio, video, applications
-
- public plasmaCrawlLURLOldEntry(URL url, String descr, Date moddate,
+ public plasmaCrawlLURLOldEntry(String url, String descr, Date moddate,
Date loaddate, String referrerHash, int copyCount,
boolean localNeed, int quality, String language, char doctype,
int size, int wordCount) {
@@ -110,7 +97,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
public plasmaCrawlLURLOldEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
try {
this.urlHash = entry.getColString(0, null);
- this.url = new URL(entry.getColString(1, "UTF-8").trim());
+ this.url = entry.getColString(1, "UTF-8").trim();
this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim();
this.moddate = new Date(86400000 * entry.getColLong(3));
this.loaddate = new Date(86400000 * entry.getColLong(4));
@@ -144,7 +131,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
this.copyCount = Integer.parseInt(prop.getProperty("cc", "0"));
this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " ");
if (setGlobal) this.flags = "G ";
- this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
+ this.url = crypt.simpleDecode(prop.getProperty("url", ""), null);
this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null);
if (this.descr == null) this.descr = this.url.toString();
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
@@ -195,13 +182,9 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
// that should be enough for all web pages on the world
return this.urlHash;
}
-
- public URL url() {
- return url;
- }
-
- public String descr() {
- return descr;
+
+ public Components comp() {
+ return new Components(url, descr, "", "", "");
}
public Date moddate() {
@@ -263,9 +246,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
if (moddate.before(other.moddate())) return true;
if (moddate.equals(other.moddate())) {
if (loaddate.before(other.loaddate())) return true;
- if (loaddate.equals(other.loaddate())) {
- if (quality < other.quality()) return true;
- }
+ if (loaddate.equals(other.loaddate())) return true;
}
return false;
}
@@ -297,30 +278,10 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
return corePropStr;
} catch (Exception e) {
- // serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
- // if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
- // if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
- // e.printStackTrace();
return null;
}
}
- /*
- public String toString(int posintext, int posinphrase, int posofphrase) {
- // add information needed for remote transport
- final StringBuffer core = corePropList();
- if (core == null) return null;
-
- core.ensureCapacity(core.length() + 200);
- core.insert(0,"{")
- .append(",posintext=").append(posintext)
- .append(",posinphrase=").append(posinphrase)
- .append(",posofphraseint=").append(posofphrase)
- .append("}");
- return core.toString();
- }
- */
-
public String toString(String snippet) {
// add information needed for remote transport
final StringBuffer core = corePropList();
diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java
index c827ee6af..a7eea14e0 100644
--- a/source/de/anomic/plasma/plasmaDHTChunk.java
+++ b/source/de/anomic/plasma/plasmaDHTChunk.java
@@ -227,7 +227,7 @@ public class plasmaDHTChunk {
while ((urlIter.hasNext()) && (maxcount > refcount) && (System.currentTimeMillis() < timeout)) {
iEntry = (indexEntry) urlIter.next();
lurl = lurls.load(iEntry.urlHash(), iEntry);
- if ((lurl == null) || (lurl.url() == null)) {
+ if ((lurl == null) || (lurl.comp().url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
notBoundCounter++;
urlIter.remove();
diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java
index d849b394d..78a834304 100644
--- a/source/de/anomic/plasma/plasmaSearchImages.java
+++ b/source/de/anomic/plasma/plasmaSearchImages.java
@@ -104,7 +104,7 @@ public final class plasmaSearchImages {
plasmaCrawlLURLEntry urlentry;
while (sres.hasMoreElements()) {
urlentry = sres.nextElement();
- addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.url(), depth));
+ addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.comp().url(), depth));
}
}
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
index 355f60839..b628ed45b 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
@@ -197,8 +197,9 @@ public class plasmaSearchRankingProfile {
long ranking = preranking;
// prefer hit with 'prefer' pattern
- if (page.url().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
- if (page.descr().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
+ plasmaCrawlLURLEntry.Components comp = page.comp();
+ if (comp.url().toNormalform().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
+ if (comp.descr().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
// apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) {
@@ -220,11 +221,11 @@ public class plasmaSearchRankingProfile {
}
// prefer short urls
- ranking += (256 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
+ ranking += (256 - comp.url().toNormalform().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
ranking += (8 * Math.max(0, 32 - urlcomps.length)) << ((Integer) coeff.get(URLCOMPS)).intValue();
// prefer long descriptions
- ranking += (256 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
+ ranking += (256 * comp.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += (256 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
return ranking;
diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java
index 0a2234ce3..0878c2350 100644
--- a/source/de/anomic/plasma/plasmaSearchResult.java
+++ b/source/de/anomic/plasma/plasmaSearchResult.java
@@ -108,11 +108,10 @@ public final class plasmaSearchResult {
protected void addResult(plasmaCrawlLURLEntry page, Long preranking) {
// take out relevant information for reference computation
- URL url = page.url();
- String descr = page.descr();
- if ((url == null) || (descr == null)) return;
- String[] urlcomps = htmlFilterContentScraper.urlComps(url.toString()); // word components of the url
- String[] descrcomps = descr.toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
+ plasmaCrawlLURLEntry.Components comp = page.comp();
+ if ((comp.url() == null) || (comp.descr() == null)) return;
+ String[] urlcomps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()); // word components of the url
+ String[] descrcomps = comp.descr().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
// store everything
results.add(new Object[] {page, urlcomps, descrcomps, preranking});
@@ -168,12 +167,12 @@ public final class plasmaSearchResult {
Iterator i = pageAcc.entrySet().iterator();
HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation
Map.Entry entry;
- String path;
+ String path = null;
// first scan all entries and find all urls that are referenced
while (i.hasNext()) {
entry = (Map.Entry) i.next();
- path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url());
+ path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url());
paths.put(path, entry.getKey());
//if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey());
@@ -184,7 +183,7 @@ public final class plasmaSearchResult {
String shorten;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
- path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url());
+ path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url());
shorten = shortenPath(path);
// scan all subpaths of the url
while (shorten != null) {
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index e6e6516aa..503570692 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -289,6 +289,7 @@ public class plasmaSnippetCache {
* @return the parsed document as {@link plasmaParserDocument}
*/
public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) {
+ if (url == null) return null;
IResourceInfo docInfo = null;
try {
// trying to load the resource body from cache
@@ -634,11 +635,12 @@ public class plasmaSnippetCache {
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {
urlentry = acc.nextElement();
- if (urlentry.url().getHost().endsWith(".yacyh")) continue;
- urlstring = urlentry.url().toNormalform();
+ plasmaCrawlLURLEntry.Components comp = urlentry.comp();
+ if (comp.url().getHost().endsWith(".yacyh")) continue;
+ urlstring = comp.url().toNormalform();
if ((urlstring.matches(urlmask)) &&
- (!(existsInCache(urlentry.url(), queryhashes)))) {
- new Fetcher(urlentry.url(), queryhashes, (int) maxTime).start();
+ (!(existsInCache(comp.url(), queryhashes)))) {
+ new Fetcher(comp.url(), queryhashes, (int) maxTime).start();
i++;
}
}
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 62fc2ea03..16ac55d03 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1559,7 +1559,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// create a new loaded URL db entry
plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry(
- entry.url(), // URL
+ entry.url().toNormalform(), // URL
docDescription, // document description
docDate, // modification date
new Date(), // loaded date
@@ -1641,8 +1641,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String language = indexEntryAttribute.language(entry.url());
char doctype = indexEntryAttribute.docType(document.getMimeType());
- int urlLength = newEntry.url().toString().length();
- int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length;
+ plasmaCrawlLURLEntry.Components comp = newEntry.comp();
+ int urlLength = comp.url().toNormalform().length();
+ int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
// iterate over all words
Iterator i = condenser.words();
@@ -2046,10 +2047,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("type_globalresults", acc.globalContributions);
int i = 0;
int p;
- URL url;
plasmaCrawlLURLEntry urlentry;
String urlstring, urlname, filename, urlhash;
- String host, hash, address, descr = "";
+ String host, hash, address;
yacySeed seed;
plasmaSnippetCache.Snippet snippet;
boolean includeSnippets = false;
@@ -2058,30 +2058,29 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000;
while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) {
urlentry = acc.nextElement();
- url = urlentry.url();
+ plasmaCrawlLURLEntry.Components comp = urlentry.comp();
urlhash = urlentry.hash();
- host = url.getHost();
+ host = comp.url().getHost();
if (host.endsWith(".yacyh")) {
// translate host into current IP
p = host.indexOf(".");
hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6));
seed = yacyCore.seedDB.getConnected(hash);
- filename = url.getFile();
+ filename = comp.url().getFile();
if ((seed == null) || ((address = seed.getAddress()) == null)) {
// seed is not known from here
- removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + urlentry.descr()).getBytes()));
+ removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes()));
urlPool.loadedURL.remove(urlentry.hash()); // clean up
continue; // next result
}
- url = new URL("http://" + address + "/" + host.substring(0, p) + filename);
+ urlstring = "http://" + address + "/" + host.substring(0, p) + filename; // must be built before p is reused for the '?' position below
urlname = "http://share." + seed.getName() + ".yacy" + filename;
if ((p = urlname.indexOf("?")) > 0) urlname = urlname.substring(0, p);
- urlstring = url.toNormalform();
} else {
- urlstring = url.toNormalform();
+ urlstring = comp.url().toNormalform();
urlname = urlstring;
}
- descr = urlentry.descr();
+
// check bluelist again: filter out all links where any bluelisted word
// appear either in url, url's description or search word
@@ -2097,7 +2096,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL wordURL;
if (urlstring.matches(query.urlMask)) { //.* is default
if (includeSnippets) {
- snippet = snippetCache.retrieveSnippet(url, query.queryHashes, false, 260, 1000);
+ snippet = snippetCache.retrieveSnippet(comp.url(), query.queryHashes, false, 260, 1000);
} else {
snippet = null;
}
@@ -2107,7 +2106,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("type_results_" + i + "_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", urlstring) == null) ? 1 : 0);
prop.put("type_results_" + i + "_recommend_deletelink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_recommend_recommendlink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*");
- prop.put("type_results_" + i + "_description", descr);
+ prop.put("type_results_" + i + "_description", comp.descr());
prop.put("type_results_" + i + "_url", urlstring);
prop.put("type_results_" + i + "_urlhash", urlhash);
prop.put("type_results_" + i + "_urlhexhash", yacySeed.b64Hash2hexHash(urlhash));
@@ -2196,19 +2195,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// determine the url string
plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0;
-
- URL url = entry.url();
- if (url == null) return 0;
+ plasmaCrawlLURLEntry.Components comp = entry.comp();
+ if (comp.url() == null) return 0;
InputStream resourceContent = null;
try {
// get the resource content
- Object[] resource = snippetCache.getResource(url, fetchOnline, 10000);
+ Object[] resource = snippetCache.getResource(comp.url(), fetchOnline, 10000);
resourceContent = (InputStream) resource[0];
Long resourceContentLength = (Long) resource[1];
// parse the resource
- plasmaParserDocument document = snippetCache.parseDocument(url, resourceContentLength.longValue(), resourceContent);
+ plasmaParserDocument document = snippetCache.parseDocument(comp.url(), resourceContentLength.longValue(), resourceContent);
// getting parsed body input stream
InputStream docBodyInputStream = document.getText();
diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
index cfe7b1391..584d1ff53 100644
--- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java
+++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
@@ -334,7 +334,7 @@ public class plasmaSwitchboardQueue {
if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null);
- if (entry == null) referrerURL = null; else referrerURL = entry.url();
+ if (entry == null) referrerURL = null; else referrerURL = entry.comp().url();
}
return referrerURL;
}
diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java
index d1d4e0940..e02af682f 100644
--- a/source/de/anomic/plasma/plasmaURLPool.java
+++ b/source/de/anomic/plasma/plasmaURLPool.java
@@ -84,7 +84,7 @@ public class plasmaURLPool {
if (ne != null) return ne.url();
} catch (IOException e) {}
plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null);
- if (le != null) return le.url();
+ if (le != null) return le.comp().url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
return null;
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 3690804ea..63bc44184 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -780,7 +780,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
if (ue == null) {
urlHashs.add(entry.urlHash());
} else {
- url = ue.url();
+ url = ue.comp().url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
}
diff --git a/source/de/anomic/tools/bitfield.java b/source/de/anomic/tools/bitfield.java
index 95e3534ca..b417145ab 100644
--- a/source/de/anomic/tools/bitfield.java
+++ b/source/de/anomic/tools/bitfield.java
@@ -46,7 +46,7 @@ public class bitfield {
public bitfield(int bytelength) {
this.bb= new byte[bytelength];
- for (int i = 0 ; i < bytelength; i++) bb[i] = (char) 48;
+ for (int i = 0 ; i < bytelength; i++) bb[i] = 0; // NOTE(review): a freshly allocated byte[] is already zero-filled in Java, so this loop looks redundant - confirm before removing
}
public bitfield(byte[] field) {
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 83d5a8190..5d400530b 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -502,7 +502,9 @@ public final class yacyClient {
for (int n = 0; n < results; n++) {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
- if ((urlEntry == null) || (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, urlEntry.url()))) { continue; } // block with backlist
+ if (urlEntry == null) continue; // newEntry returned null for this result; skip it
+ plasmaCrawlLURLEntry.Components comp = urlEntry.comp();
+ if (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, comp.url())) continue; // block with blacklist
urlManager.store(urlEntry);
urlManager.stack(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
@@ -510,19 +512,20 @@ public final class yacyClient {
final indexEntry entry;
if (urlEntry.word() == null) {
// the old way to define words
- int urlLength = urlEntry.url().toString().length();
- int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
+ int urlLength = comp.url().toNormalform().length();
+ int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
entry = new indexURLEntry(
urlEntry.hash(),
- urlLength, urlComps,
- urlEntry.descr().length(),
+ urlLength,
+ urlComps,
+ comp.descr().length(),
urlEntry.wordCount(),
0, 0, 0, 0, 0, 0,
urlEntry.size(),
urlEntry.moddate().getTime(),
System.currentTimeMillis(),
- urlEntry.quality(),
+ 0,
urlEntry.language(),
urlEntry.doctype(),
0,0,
diff --git a/source/yacy.java b/source/yacy.java
index 3acdea737..01a4d055e 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -958,7 +958,8 @@ public final class yacy {
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlLURLEntry) eiter.next();
- if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), null);
+ plasmaCrawlLURLEntry.Components comp = (entry == null) ? null : entry.comp(); // never call comp() on a null entry
+ if ((comp != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null);
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
@@ -1068,12 +1069,13 @@ public final class yacy {
plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlLURLEntry) eiter.next();
- if ((entry != null) && (entry.url() != null)) {
+ plasmaCrawlLURLEntry.Components comp = (entry == null) ? null : entry.comp(); // never call comp() on a null entry
+ if ((comp != null) && (comp.url() != null)) {
if (html) {
- bos.write(("" + entry.descr() + "
").getBytes("UTF-8"));
+ bos.write(("" + comp.descr() + "
").getBytes("UTF-8"));
bos.write(serverCore.crlf);
} else {
- bos.write(entry.url().toString().getBytes());
+ bos.write(comp.url().toNormalform().getBytes("UTF-8")); // same charset as the html branch, not the platform default
bos.write(serverCore.crlf);
}
}
@@ -1128,7 +1130,8 @@ public final class yacy {
plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlLURLEntry) eiter.next();
- if ((entry != null) && (entry.url() != null)) {
+ plasmaCrawlLURLEntry.Components comp = (entry == null) ? null : entry.comp(); // never call comp() on a null entry
+ if ((comp != null) && (comp.url() != null)) {
fsp.put(entry.toRowEntry(), entry.loaddate());
}
}