- added new LURL.Entry class for next database migration

- refactoring of affected classes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2802 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent cc24dde5e0
commit b79e06615d

@ -149,15 +149,15 @@ public class Bookmarks {
// try to get the bookmark from the LURL database // try to get the bookmark from the LURL database
plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaParserDocument document = null; plasmaParserDocument document = null;
if(urlentry != null){
document = switchboard.snippetCache.retrieveDocument(urlentry.url(), true);
}
if (urlentry != null) { if (urlentry != null) {
plasmaCrawlLURLEntry.Components comp = urlentry.comp();
document = switchboard.snippetCache.retrieveDocument(comp.url(), true);
prop.put("mode_edit", 0); // create mode prop.put("mode_edit", 0); // create mode
prop.put("mode_title", urlentry.descr()); prop.put("mode_url", comp.url().toNormalform());
prop.put("mode_description", (document == null) ? urlentry.descr() : document.getMainLongTitle()); prop.put("mode_title", comp.descr());
prop.put("mode_url", urlentry.url()); prop.put("mode_description", (document == null) ? comp.descr(): document.getMainLongTitle());
prop.put("mode_tags", (document == null) ? "" : document.getKeywords(',')); prop.put("mode_author", comp.author());
prop.put("mode_tags", (document == null) ? comp.tags() : document.getKeywords(','));
prop.put("mode_public", 0); prop.put("mode_public", 0);
} }
if (document != null) document.close(); if (document != null) document.close();

@ -164,9 +164,6 @@
<tr><td class="small">Loaded-Date</td><td class="tt">#[loaddate]#</td></tr> <tr><td class="small">Loaded-Date</td><td class="tt">#[loaddate]#</td></tr>
<tr><td class="small">Referrer</td><td class="tt">#[referrer]#</td></tr> <tr><td class="small">Referrer</td><td class="tt">#[referrer]#</td></tr>
<tr><td class="small">Doctype</td><td class="tt">#[doctype]#</td></tr> <tr><td class="small">Doctype</td><td class="tt">#[doctype]#</td></tr>
<tr><td class="small">Copy-Count</td><td class="tt">#[copyCount]#</td></tr>
<tr><td class="small">Local-Flag</td><td class="tt">#[local]#</td></tr>
<tr><td class="small">Quality</td><td class="tt">#[quality]#</td></tr>
<tr><td class="small">Language</td><td class="tt">#[language]#</td></tr> <tr><td class="small">Language</td><td class="tt">#[language]#</td></tr>
<tr><td class="small">Size</td><td class="tt">#[size]#</td></tr> <tr><td class="small">Size</td><td class="tt">#[size]#</td></tr>
<tr><td class="small">Words</td><td class="tt">#[wordCount]#</td></tr> <tr><td class="small">Words</td><td class="tt">#[wordCount]#</td></tr>

@ -222,8 +222,7 @@ public class IndexControl_p {
if (entry == null) { if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else { } else {
URL url = entry.url(); urlstring = entry.comp().url().toNormalform();
urlstring = url.toNormalform();
prop.put("urlstring", ""); prop.put("urlstring", "");
switchboard.urlPool.loadedURL.remove(urlhash); switchboard.urlPool.loadedURL.remove(urlhash);
prop.put("result", "Removed URL " + urlstring); prop.put("result", "Removed URL " + urlstring);
@ -339,9 +338,7 @@ public class IndexControl_p {
if (entry == null) { if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash); prop.put("result", "No Entry for URL hash " + urlhash);
} else { } else {
URL url = entry.url(); prop.put("urlstring", entry.comp().url().toNormalform());
urlstring = url.toString();
prop.put("urlstring", urlstring);
prop.putAll(genUrlProfile(switchboard, entry, urlhash)); prop.putAll(genUrlProfile(switchboard, entry, urlhash));
} }
} }
@ -410,30 +407,27 @@ public class IndexControl_p {
prop.put("genUrlProfile_urlhash", urlhash); prop.put("genUrlProfile_urlhash", urlhash);
return prop; return prop;
} }
URL url = entry.url(); plasmaCrawlLURLEntry.Components comp = entry.comp();
String referrer = null; String referrer = null;
plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null); plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (le == null) { if (le == null) {
referrer = "<unknown>"; referrer = "<unknown>";
} else { } else {
referrer = le.url().toString(); referrer = le.comp().url().toNormalform();
} }
if (url == null) { if (comp.url() == null) {
prop.put("genUrlProfile", 1); prop.put("genUrlProfile", 1);
prop.put("genUrlProfile_urlhash", urlhash); prop.put("genUrlProfile_urlhash", urlhash);
return prop; return prop;
} }
prop.put("genUrlProfile", 2); prop.put("genUrlProfile", 2);
prop.put("genUrlProfile_urlNormalform", url.toNormalform()); prop.put("genUrlProfile_urlNormalform", comp.url().toNormalform());
prop.put("genUrlProfile_urlhash", urlhash); prop.put("genUrlProfile_urlhash", urlhash);
prop.put("genUrlProfile_urlDescr", entry.descr()); prop.put("genUrlProfile_urlDescr", comp.descr());
prop.put("genUrlProfile_moddate", entry.moddate()); prop.put("genUrlProfile_moddate", entry.moddate());
prop.put("genUrlProfile_loaddate", entry.loaddate()); prop.put("genUrlProfile_loaddate", entry.loaddate());
prop.put("genUrlProfile_referrer", referrer); prop.put("genUrlProfile_referrer", referrer);
prop.put("genUrlProfile_doctype", ""+entry.doctype()); prop.put("genUrlProfile_doctype", ""+entry.doctype());
prop.put("genUrlProfile_copyCount", entry.copyCount());
prop.put("genUrlProfile_local", ""+entry.local());
prop.put("genUrlProfile_quality", entry.quality());
prop.put("genUrlProfile_language", entry.language()); prop.put("genUrlProfile_language", entry.language());
prop.put("genUrlProfile_size", entry.size()); prop.put("genUrlProfile_size", entry.size());
prop.put("genUrlProfile_wordCount", entry.wordCount()); prop.put("genUrlProfile_wordCount", entry.wordCount());
@ -467,7 +461,7 @@ public class IndexControl_p {
if (le == null) { if (le == null) {
tm.put(uh[0], uh); tm.put(uh[0], uh);
} else { } else {
us = le.url().toString(); us = le.comp().url().toNormalform();
tm.put(us, uh); tm.put(us, uh);
} }

@ -337,6 +337,7 @@ public class PerformanceMemory_p {
} }
private static void putprop(serverObjects prop, serverSwitch env, String wdb, String db, String set) { private static void putprop(serverObjects prop, serverSwitch env, String wdb, String db, String set) {
if ((slt == null) || (ost == null)) return;
usd = chk * slt[1] + obj * ost[2] /*hit*/ + kelondroTree.cacheObjectMissSize * ost[3] /*miss*/; usd = chk * slt[1] + obj * ost[2] /*hit*/ + kelondroTree.cacheObjectMissSize * ost[3] /*miss*/;
bst = (((((long) chk) * ((long) req)) >> 10) + 1) << 10; bst = (((((long) chk) * ((long) req)) >> 10) + 1) << 10;
if (set.equals("setBest")) env.setConfig("ramCache" + db, bst); if (set.equals("setBest")) env.setConfig("ramCache" + db, bst);

@ -54,7 +54,6 @@ import java.util.Enumeration;
import de.anomic.data.wikiCode; import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaParserDocument;
@ -116,12 +115,12 @@ public class ViewFile {
} }
// gettin the url that belongs to the entry // gettin the url that belongs to the entry
URL url = urlEntry.url(); plasmaCrawlLURLEntry.Components comp = urlEntry.comp();
if (url == null) { if ((comp == null) || (comp.url() == null)) {
prop.put("error",3); prop.put("error",3);
prop.put("viewMode",VIEW_MODE_NO_TEXT); prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop; return prop;
} }
// loading the resource content as byte array // loading the resource content as byte array
InputStream resource = null; InputStream resource = null;
@ -130,14 +129,14 @@ public class ViewFile {
String resMime = null; String resMime = null;
try { try {
// trying to load the resource body // trying to load the resource body
resource = sb.cacheManager.getResourceContentStream(url); resource = sb.cacheManager.getResourceContentStream(comp.url());
resourceLength = sb.cacheManager.getResourceContentLength(url); resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
// if the resource body was not cached we try to load it from web // if the resource body was not cached we try to load it from web
if (resource == null) { if (resource == null) {
plasmaHTCache.Entry entry = null; plasmaHTCache.Entry entry = null;
try { try {
entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false); entry = sb.snippetCache.loadResourceFromWeb(comp.url(), 5000, false);
} catch (plasmaCrawlerException e) { } catch (plasmaCrawlerException e) {
prop.put("error",4); prop.put("error",4);
prop.put("error_errorText",e.getMessage()); prop.put("error_errorText",e.getMessage());
@ -147,8 +146,8 @@ public class ViewFile {
if (entry != null) { if (entry != null) {
resInfo = entry.getDocumentInfo(); resInfo = entry.getDocumentInfo();
resource = sb.cacheManager.getResourceContentStream(url); resource = sb.cacheManager.getResourceContentStream(comp.url());
resourceLength = sb.cacheManager.getResourceContentLength(url); resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
} }
if (resource == null) { if (resource == null) {
@ -164,19 +163,19 @@ public class ViewFile {
// try to load the metadata from cache // try to load the metadata from cache
try { try {
resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url()); resInfo = sb.cacheManager.loadResourceInfo(comp.url());
} catch (Exception e) { /* ignore this */} } catch (Exception e) { /* ignore this */}
// if the metadata where not cached try to load it from web // if the metadata where not cached try to load it from web
if (resInfo == null) { if (resInfo == null) {
String protocol = url.getProtocol(); String protocol = comp.url().getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) { if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error",6); prop.put("error",6);
prop.put("viewMode",VIEW_MODE_NO_TEXT); prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop; return prop;
} }
httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig); httpHeader responseHeader = httpc.whead(comp.url(),comp.url().getHost(),5000,null,null,sb.remoteProxyConfig);
if (responseHeader == null) { if (responseHeader == null) {
prop.put("error",4); prop.put("error",4);
prop.put("error_errorText","Unable to load resource metadata."); prop.put("error_errorText","Unable to load resource metadata.");
@ -184,7 +183,7 @@ public class ViewFile {
return prop; return prop;
} }
try { try {
resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader); resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(comp.url(), responseHeader);
} catch (Exception e) { } catch (Exception e) {
prop.put("error",4); prop.put("error",4);
prop.put("error_errorText",e.getMessage()); prop.put("error_errorText",e.getMessage());
@ -230,12 +229,12 @@ public class ViewFile {
prop.put("viewMode_plainText",content); prop.put("viewMode_plainText",content);
} else if (viewMode.equals("iframe")) { } else if (viewMode.equals("iframe")) {
prop.put("viewMode",VIEW_MODE_AS_IFRAME); prop.put("viewMode",VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url",url.toString()); prop.put("viewMode_url",comp.url().toNormalform());
} else if (viewMode.equals("parsed") || viewMode.equals("sentences")) { } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
// parsing the resource content // parsing the resource content
plasmaParserDocument document = null; plasmaParserDocument document = null;
try { try {
document = sb.snippetCache.parseDocument(url, resourceLength, resource,resInfo); document = sb.snippetCache.parseDocument(comp.url(), resourceLength, resource,resInfo);
if (document == null) { if (document == null) {
prop.put("error",5); prop.put("error",5);
prop.put("error_errorText","Unknown error"); prop.put("error_errorText","Unknown error");
@ -295,13 +294,13 @@ public class ViewFile {
} }
if (document != null) document.close(); if (document != null) document.close();
} }
prop.put("error",0); prop.put("error", 0);
prop.put("error_url",url.toString()); prop.put("error_url", comp.url().toNormalform());
prop.put("error_hash",urlHash); prop.put("error_hash", urlHash);
prop.put("error_wordCount",Integer.toString(urlEntry.wordCount())); prop.put("error_wordCount", Integer.toString(urlEntry.wordCount()));
prop.put("error_desc",urlEntry.descr()); prop.put("error_desc", comp.descr());
prop.put("error_size",urlEntry.size()); prop.put("error_size", urlEntry.size());
prop.put("error_mimeType",resMime); prop.put("error_mimeType", resMime);
return prop; return prop;
} }

@ -359,7 +359,7 @@ public class dir {
final URL url = new URL(urlstring); final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes())); final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry( final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
url, "YaCyShare: " + descr, new Date(), new Date(), url.toNormalform(), "YaCyShare: " + descr, new Date(), new Date(),
"AAAAAAAAAAAA", /*referrer*/ "AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/ 0, /*copycount*/
false, /*localneed*/ false, /*localneed*/

@ -125,8 +125,13 @@ public final class crawlReceipt {
} else if (result.equals("fill")) { } else if (result.equals("fill")) {
// generating a new loaded URL entry // generating a new loaded URL entry
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true); plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true);
if ((entry == null)||(entry.url()==null)) { if (entry == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam + log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
} else {
plasmaCrawlLURLEntry.Components comp = entry.comp();
if (comp.url() == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr); "\n\tURL properties: "+ propStr);
} else try { } else try {
// put new entry into database // put new entry into database
@ -134,18 +139,18 @@ public final class crawlReceipt {
switchboard.urlPool.loadedURL.stack(entry, youare, iam, 1); switchboard.urlPool.loadedURL.stack(entry, youare, iam, 1);
// generating url hash // generating url hash
String newUrlHash = indexURL.urlHash(entry.url()); String newUrlHash = indexURL.urlHash(comp.url());
String oldUrlHash = indexURL.oldurlHash(entry.url()); String oldUrlHash = indexURL.oldurlHash(comp.url());
// removing URL from notice URL // removing URL from notice URL
switchboard.urlPool.noticeURL.remove(newUrlHash); switchboard.urlPool.noticeURL.remove(newUrlHash);
switchboard.urlPool.noticeURL.remove(oldUrlHash); switchboard.urlPool.noticeURL.remove(oldUrlHash);
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + entry.url()); log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + comp.url().toNormalform());
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
}
// ready for more // ready for more
prop.put("delay", "10"); prop.put("delay", "10");
} else { } else {

@ -249,7 +249,7 @@ public final class search {
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) { while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = (plasmaCrawlLURLEntry) acc.nextElement(); urlentry = (plasmaCrawlLURLEntry) acc.nextElement();
if (includesnippet) { if (includesnippet) {
snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260, 1000); snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, 260, 1000);
} else { } else {
snippet = null; snippet = null;
} }

@ -98,25 +98,29 @@ public final class transferURL {
yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName); yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
} else { } else {
lEntry = sb.urlPool.loadedURL.newEntry(urls, true); lEntry = sb.urlPool.loadedURL.newEntry(urls, true);
if ((lEntry != null) && (lEntry.url() != null)) { if (lEntry == null) {
if ((blockBlacklist) && yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
(plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), lEntry.url()))) {
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
blocked++;
} else try {
sb.urlPool.loadedURL.store(lEntry);
sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + lEntry.url() + "' from peer " + otherPeerName);
received++;
} catch (IOException e) {
e.printStackTrace();
}
} else {
yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName +
"\n\tURL Property: " + urls);
// TODO: should we send back an error message??? // TODO: should we send back an error message???
} else {
plasmaCrawlLURLEntry.Components comp = lEntry.comp();
if (comp.url() == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???
} else {
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
blocked++;
} else try {
sb.urlPool.loadedURL.store(lEntry);
sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
received++;
} catch (IOException e) {
e.printStackTrace();
}
}
} }
} }
} }

@ -191,13 +191,15 @@ public class yacysearch {
final String recommendHash = post.get("recommendref", ""); // urlhash final String recommendHash = post.get("recommendref", ""); // urlhash
plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null); plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
if (urlentry != null) { if (urlentry != null) {
plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true); plasmaCrawlLURLEntry.Components comp = urlentry.comp();
plasmaParserDocument document;
document = sb.snippetCache.retrieveDocument(comp.url(), true);
if (document != null) { if (document != null) {
// create a news message // create a news message
HashMap map = new HashMap(); HashMap map = new HashMap();
map.put("url", urlentry.url().toNormalform().replace(',', '|')); map.put("url", comp.url().toNormalform().replace(',', '|'));
map.put("title", urlentry.descr().replace(',', ' ')); map.put("title", comp.descr().replace(',', ' '));
map.put("description", ((document == null) ? urlentry.descr() : document.getMainLongTitle()).replace(',', ' ')); map.put("description", ((document == null) ? comp.descr() : document.getMainLongTitle()).replace(',', ' '));
map.put("tags", ((document == null) ? "" : document.getKeywords(' '))); map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map)); yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
document.close(); document.close();

@ -52,6 +52,7 @@ public class indexURL {
public static final int urlStringLength = 256;// not too short for links without parameters public static final int urlStringLength = 256;// not too short for links without parameters
public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>) public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>)
public static final int urlNameLength = 40; // the tag content between <a> and </a> public static final int urlNameLength = 40; // the tag content between <a> and </a>
public static final int urldescrtagsLength = 320;// the url, the description and tags in one string
public static final int urlErrorLength = 80; // a reason description for unavailable urls public static final int urlErrorLength = 80; // a reason description for unavailable urls
public static final int urlDateLength = 4; // any date, shortened public static final int urlDateLength = 4; // any date, shortened
public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index

@ -158,7 +158,10 @@ public class kelondroCollectionIndex {
ientry.setCol(idx_col_indexpos, j); ientry.setCol(idx_col_indexpos, j);
ientry.setCol(idx_col_lastread, t); ientry.setCol(idx_col_lastread, t);
ientry.setCol(idx_col_lastwrote, t); ientry.setCol(idx_col_lastwrote, t);
index.put(ientry); if (index instanceof kelondroBufferedIndex)
((kelondroBufferedIndex) index).add(ientry);
else
index.put(ientry);
// write a log // write a log
if (System.currentTimeMillis() - lastlog > 30000) { if (System.currentTimeMillis() - lastlog > 30000) {

@ -40,7 +40,7 @@ public class kelondroRow {
protected kelondroColumn[] row; protected kelondroColumn[] row;
protected int[] colstart; protected int[] colstart;
protected int objectsize; protected int objectsize;
protected Map nickref = null; protected Map nickref = null; // a mapping from nicknames to Object[2]{kelondroColumn, Integer(colstart)}
public kelondroRow(kelondroColumn[] row) { public kelondroRow(kelondroColumn[] row) {
this.row = row; this.row = row;
@ -142,7 +142,12 @@ public class kelondroRow {
if (external == null) return null; if (external == null) return null;
return new Entry(external); return new Entry(external);
} }
/*
public Entry newEntry(Properties prop) {
if (prop == null) return null;
return new Entry(prop);
}
*/
public class Entry implements Comparable { public class Entry implements Comparable {
private byte[] rowinstance; private byte[] rowinstance;
@ -202,7 +207,19 @@ public class kelondroRow {
} }
} }
} }
/*
public Entry(Properties prop) {
// parse external form
if (nickref == null) genNickRef();
rowinstance = new byte[objectsize];
Iterator i = prop.entrySet().iterator();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
setCol(((String) entry.getKey()).trim(), ((String) entry.getValue()).trim().getBytes());
}
}
*/
public int compareTo(Object o) { public int compareTo(Object o) {
if (o instanceof Entry) { if (o instanceof Entry) {
return kelondroNaturalOrder.naturalOrder.compare(this.rowinstance, ((Entry) o).rowinstance); return kelondroNaturalOrder.naturalOrder.compare(this.rowinstance, ((Entry) o).rowinstance);
@ -354,7 +371,7 @@ public class kelondroRow {
return getColLong(row[column].encoder(), colstart[column], row[column].cellwidth()); return getColLong(row[column].encoder(), colstart[column], row[column].cellwidth());
} }
public long getColLong(int encoder, int offset, int length) { private long getColLong(int encoder, int offset, int length) {
// start - fix for badly stored parameters // start - fix for badly stored parameters
if ((length >= 3) && (rowinstance[offset] == '[') && (rowinstance[offset + 1] == 'B') && (rowinstance[offset + 2] == '@')) return 0; if ((length >= 3) && (rowinstance[offset] == '[') && (rowinstance[offset + 1] == 'B') && (rowinstance[offset + 2] == '@')) return 0;
if ((length == 2) && (rowinstance[offset] == '[') && (rowinstance[offset + 1] == 'B')) return 0; if ((length == 2) && (rowinstance[offset] == '[') && (rowinstance[offset + 1] == 'B')) return 0;
@ -378,6 +395,13 @@ public class kelondroRow {
throw new kelondroException("ROW", "getColLong did not find appropriate encoding"); throw new kelondroException("ROW", "getColLong did not find appropriate encoding");
} }
public byte getColByte(String nickname, byte dflt) {
if (nickref == null) genNickRef();
Object[] ref = (Object[]) nickref.get(nickname);
if (ref == null) return dflt;
return rowinstance[((Integer) ref[1]).intValue()];
}
public byte getColByte(int column) { public byte getColByte(int column) {
return rowinstance[colstart[column]]; return rowinstance[colstart[column]];
} }

@ -195,7 +195,7 @@ public final class plasmaCrawlLURL extends indexURL {
} }
} }
public synchronized plasmaCrawlLURLEntry newEntry(URL url, String descr, Date moddate, Date loaddate, public synchronized plasmaCrawlLURLEntry newEntry(String url, String descr, Date moddate, Date loaddate,
String referrerHash, int copyCount, boolean localNeed, String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype, int quality, String language, char doctype,
int size, int wordCount) { int size, int wordCount) {
@ -338,7 +338,6 @@ public final class plasmaCrawlLURL extends indexURL {
String cachepath, urlstr, urltxt; String cachepath, urlstr, urltxt;
yacySeed initiatorSeed, executorSeed; yacySeed initiatorSeed, executorSeed;
plasmaCrawlLURLEntry urle; plasmaCrawlLURLEntry urle;
URL url;
// needed for getCachePath(url) // needed for getCachePath(url)
final plasmaSwitchboard switchboard = plasmaSwitchboard.getSwitchboard(); final plasmaSwitchboard switchboard = plasmaSwitchboard.getSwitchboard();
@ -353,14 +352,14 @@ public final class plasmaCrawlLURL extends indexURL {
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try { try {
urle = load(urlHash, null); urle = load(urlHash, null);
plasmaCrawlLURLEntry.Components comp = urle.comp();
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString()); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash); executorSeed = yacyCore.seedDB.getConnected(executorHash);
url = urle.url(); urlstr = comp.url().toNormalform();
urlstr = url.toString();
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
cachepath = (url == null) ? "-not-cached-" : cacheManager.getCachePath(url).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1); cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1);
prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0); prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_feedbackpage", feedbackpage); prop.put("table_indexed_" + cnt + "_feedbackpage", feedbackpage);
@ -372,8 +371,8 @@ public final class plasmaCrawlLURL extends indexURL {
prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? dfltExec : executorSeed.getName()); prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? dfltExec : executorSeed.getName());
prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate())); prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate()));
prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount()); prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount());
prop.put("table_indexed_" + cnt + "_urldescr", urle.descr()); prop.put("table_indexed_" + cnt + "_urldescr", comp.descr());
prop.put("table_indexed_" + cnt + "_url", (urle.url() == null) ? "-not-cached-" : ((makeLink) ? ("<a href=\"CacheAdmin_p.html?action=info&path=" + cachepath + "\" class=\"small\" title=\"" + urlstr + "\">" + urltxt + "</a>") : urlstr)); prop.put("table_indexed_" + cnt + "_url", (cachepath == null) ? "-not-cached-" : ((makeLink) ? ("<a href=\"CacheAdmin_p.html?action=info&path=" + cachepath + "\" class=\"small\" title=\"" + urlstr + "\">" + urltxt + "</a>") : urlstr));
dark = !dark; dark = !dark;
cnt++; cnt++;
} catch (Exception e) { } catch (Exception e) {
@ -535,18 +534,19 @@ public final class plasmaCrawlLURL extends indexURL {
} }
plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next(); plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next();
plasmaCrawlLURLEntry.Components comp = entry.comp();
totalSearchedUrls++; totalSearchedUrls++;
if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, entry.url()) || if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, comp.url()) ||
plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, entry.url())) { plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, comp.url())) {
lastBlacklistedUrl = entry.url().toString(); lastBlacklistedUrl = comp.url().toNormalform();
lastBlacklistedHash = entry.hash(); lastBlacklistedHash = entry.hash();
serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + entry.url()); serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + comp.url().toNormalform());
remove(entry.hash()); remove(entry.hash());
if (blacklistedUrls % 100 == 0) { if (blacklistedUrls % 100 == 0) {
serverLog.logInfo("URLDBCLEANER", "Deleted " + blacklistedUrls + " URLs until now. Last deleted URL-Hash: " + lastBlacklistedUrl); serverLog.logInfo("URLDBCLEANER", "Deleted " + blacklistedUrls + " URLs until now. Last deleted URL-Hash: " + lastBlacklistedUrl);
} }
} }
lastUrl = entry.url().toString(); lastUrl = comp.url().toNormalform();
lastHash = entry.hash(); lastHash = entry.hash();
} }
} catch (RuntimeException e) { } catch (RuntimeException e) {
@ -605,7 +605,7 @@ public final class plasmaCrawlLURL extends indexURL {
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false); final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false);
final Iterator enu = urls.entries(true, false, null); final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) { while (enu.hasNext()) {
((plasmaCrawlLURLEntry) enu.next()).print(); System.out.println(((plasmaCrawlLURLEntry) enu.next()).toString());
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();

@ -27,10 +27,11 @@
package de.anomic.plasma; package de.anomic.plasma;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date; import java.util.Date;
import de.anomic.net.URL;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.index.indexEntry; import de.anomic.index.indexEntry;
public interface plasmaCrawlLURLEntry { public interface plasmaCrawlLURLEntry {
@ -39,9 +40,7 @@ public interface plasmaCrawlLURLEntry {
public String hash(); public String hash();
public URL url(); public Components comp();
public String descr();
public Date moddate(); public Date moddate();
@ -51,12 +50,6 @@ public interface plasmaCrawlLURLEntry {
public char doctype(); public char doctype();
public int copyCount();
public boolean local();
public int quality();
public String language(); public String language();
public int size(); public int size();
@ -73,6 +66,26 @@ public interface plasmaCrawlLURLEntry {
public String toString(); public String toString();
public void print(); public class Components {
private URL url;
private String descr, author, tags, ETag;
public Components(String url, String descr, String author, String tags, String ETag) {
try {
this.url = new URL(url);
} catch (MalformedURLException e) {
this.url = null;
}
this.descr = descr;
this.author = author;
this.tags = tags;
this.ETag = ETag;
}
public URL url() { return this.url; }
public String descr() { return this.descr; }
public String author() { return this.author; }
public String tags() { return this.tags; }
public String ETag() { return this.ETag; }
}
} }

@ -0,0 +1,337 @@
package de.anomic.plasma;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.Properties;
import java.util.ArrayList;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCodings;
import de.anomic.tools.crypt;
import de.anomic.tools.bitfield;
import de.anomic.tools.nxTools;
public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {

    /**
     * Row layout of one loaded-URL record in the (new) LURL database.
     * Dates are stored with day granularity (milliseconds / 86400000).
     */
    public static final kelondroRow rowdef = new kelondroRow(
        "String hash-12, " +            // the url's hash
        "String comp-360, " +           // components: the url, description, author and tags. As 5th element, an ETag is possible
        "Cardinal mod-4 {b256}, " +     // last-modified from the httpd
        "Cardinal load-4 {b256}, " +    // time when the url was loaded
        "String referrer-12, " +        // (one of) the url's referrer hash(es)
        "byte[] md5-8, " +              // the md5 of the url content (to identify changes)
                                        // FIX: the ", " separator after md5-8 was missing, which fused this
                                        // column with the next one into the malformed token "md5-8Cardinal size-6"
        "Cardinal size-6 {b256}, " +    // size of file in bytes
        "Cardinal wc-3 {b256}, " +      // size of file by number of words; for video and audio: seconds
        "byte[] dt-1, " +               // doctype, taken from extension or any other heuristic
        "byte[] flags-4, " +            // flags; any stuff (see Word-Entity definition)
        "String lang-2, " +             // language
        "Cardinal llocal-2 {b256}, " +  // # of outlinks to same domain; for video and image: width
        "Cardinal lother-2 {b256}, " +  // # of outlinks to outside domain; for video and image: height
        "Cardinal limage-2 {b256}, " +  // # of embedded image links
        "Cardinal laudio-2 {b256}, " +  // # of embedded audio links; for audio: track number; for video: number of audio tracks
        "Cardinal lvideo-2 {b256}, " +  // # of embedded video links
        "Cardinal lapp-2 {b256}");      // # of embedded links to applications

    private kelondroRow.Entry entry;    // the backing database row
    private String snippet;             // only set if the url was transported via remote search requests
    private indexEntry word;            // this is only used if the url is transported via remote search requests

    /**
     * Creates a new entry from explicit attribute values (used when a page has
     * just been indexed locally).
     *
     * @param url      the loaded URL; also determines the row's hash column
     * @param descr    document description/title
     * @param author   document author
     * @param tags     keyword tags
     * @param ETag     HTTP ETag (for re-crawl decisions)
     * @param mod      last-modified date reported by the server
     * @param load     date the url was loaded
     * @param referrer referrer url hash; may be null (then a dummy hash is stored)
     * @param md5      md5 of the url content
     */
    public plasmaCrawlLURLNewEntry(
            URL url,
            String descr,
            String author,
            String tags,
            String ETag,
            Date mod,
            Date load,
            String referrer,
            byte[] md5,
            long size,
            int wc,
            byte dt,
            bitfield flags,
            String lang,
            int llocal,
            int lother,
            int laudio,
            int limage,
            int lvideo,
            int lapp) {
        // create new entry and store it into database
        this.entry = rowdef.newEntry();
        this.entry.setCol("hash", indexURL.urlHash(url), null);
        this.entry.setCol("comp", encodeComp(url, descr, author, tags, ETag));
        this.entry.setCol("mod", encodeDate(mod));
        this.entry.setCol("load", encodeDate(load));
        // guard against a missing referrer; the Properties constructor below
        // already defaults to indexURL.dummyHash, so do the same here
        this.entry.setCol("referrer", ((referrer == null) ? indexURL.dummyHash : referrer).getBytes());
        this.entry.setCol("md5", md5);
        this.entry.setCol("size", size);
        this.entry.setCol("wc", wc);
        this.entry.setCol("dt", dt);
        this.entry.setCol("flags", flags.getBytes());
        this.entry.setCol("lang", lang.getBytes());
        this.entry.setCol("llocal", llocal);
        this.entry.setCol("lother", lother);
        this.entry.setCol("limage", limage);
        this.entry.setCol("laudio", laudio);
        this.entry.setCol("lvideo", lvideo);
        this.entry.setCol("lapp", lapp);
        this.snippet = null;
        this.word = null;
    }

    // encode a date with day granularity into a 4-byte cardinal
    byte[] encodeDate(Date d) {
        return kelondroNaturalOrder.encodeLong(d.getTime() / 86400000, 4);
    }

    // serialize the url components as newline-separated fields; the field order
    // here MUST match the decoding order in comp(): url, descr, author, tags, ETag
    byte[] encodeComp(URL url, String descr, String author, String tags, String ETag) {
        serverByteBuffer s = new serverByteBuffer(200);
        s.append(url.toNormalform()).append((char) 10);
        s.append(descr).append((char) 10);  // FIX: descr was not written, which shifted all
                                            // following fields by one position on decoding
        s.append(author).append((char) 10);
        s.append(tags).append((char) 10);
        s.append(ETag).append((char) 10);
        return s.getBytes();
    }

    /**
     * Wraps an existing database row.
     *
     * @param searchedWord the word index entry if the url was found via a search; may be null
     */
    public plasmaCrawlLURLNewEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
        this.entry = entry;
        this.snippet = null;
        this.word = searchedWord;
    }

    /**
     * Generates an entry from a property list as produced by toString().
     * Used to receive entries from remote peers.
     *
     * @throws IOException if the transported url is malformed
     */
    public plasmaCrawlLURLNewEntry(Properties prop, boolean setGlobal) throws IOException {
        // the property names must correspond to the one from toString
        //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
        URL url;
        try {
            url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
        } catch (MalformedURLException e) {
            throw new IOException("URL is not proper: " + crypt.simpleDecode(prop.getProperty("url", ""), null));
        }
        String descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (descr == null) descr = "";
        String author = crypt.simpleDecode(prop.getProperty("author", ""), null); if (author == null) author = "";
        String tags = crypt.simpleDecode(prop.getProperty("tags", ""), null); if (tags == null) tags = "";
        String ETag = crypt.simpleDecode(prop.getProperty("ETag", ""), null); if (ETag == null) ETag = "";
        this.entry = rowdef.newEntry();
        this.entry.setCol("hash", indexURL.urlHash(url), null);
        this.entry.setCol("comp", encodeComp(url, descr, author, tags, ETag));
        // dates arrive in shortDay format; fall back to 'now' if unparseable
        try {
            this.entry.setCol("mod", encodeDate(indexURL.shortDayFormatter.parse(prop.getProperty("mod", "20000101"))));
        } catch (ParseException e) {
            this.entry.setCol("mod", encodeDate(new Date()));
        }
        try {
            this.entry.setCol("load", encodeDate(indexURL.shortDayFormatter.parse(prop.getProperty("load", "20000101"))));
        } catch (ParseException e) {
            this.entry.setCol("load", encodeDate(new Date()));
        }
        this.entry.setCol("referrer", prop.getProperty("referrer", indexURL.dummyHash).getBytes());
        this.entry.setCol("md5", serverCodings.decodeHex(prop.getProperty("md5", indexURL.dummyHash)));
        this.entry.setCol("size", Integer.parseInt(prop.getProperty("size", "0")));
        this.entry.setCol("wc", Integer.parseInt(prop.getProperty("wc", "0")));
        this.entry.setCol("dt", prop.getProperty("dt", "t").charAt(0));
        this.entry.setCol("flags", serverCodings.decodeHex(prop.getProperty("flags", "00000000")));
        this.entry.setCol("lang", prop.getProperty("lang", "uk").getBytes());
        this.entry.setCol("llocal", Integer.parseInt(prop.getProperty("llocal", "0")));
        this.entry.setCol("lother", Integer.parseInt(prop.getProperty("lother", "0")));
        this.entry.setCol("limage", Integer.parseInt(prop.getProperty("limage", "0")));
        this.entry.setCol("laudio", Integer.parseInt(prop.getProperty("laudio", "0")));
        this.entry.setCol("lvideo", Integer.parseInt(prop.getProperty("lvideo", "0")));
        this.entry.setCol("lapp", Integer.parseInt(prop.getProperty("lapp", "0")));
        this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null);
        this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null;
    }

    // generate a parseable string; this is a simple property-list,
    // the counterpart of the Properties constructor above
    private StringBuffer corePropList() {
        plasmaCrawlLURLEntry.Components comp = this.comp();
        final StringBuffer s = new StringBuffer(300);
        try {
            s.append("hash=").append(hash());
            s.append(",url=").append(crypt.simpleEncode(comp.url().toNormalform()));
            s.append(",descr=").append(crypt.simpleEncode(comp.descr()));
            s.append(",author=").append(crypt.simpleEncode(comp.author()));
            s.append(",tags=").append(crypt.simpleEncode(comp.tags()));
            s.append(",ETag=").append(crypt.simpleEncode(comp.ETag()));
            s.append(",mod=").append(indexURL.shortDayFormatter.format(moddate()));
            s.append(",load=").append(indexURL.shortDayFormatter.format(loaddate()));
            s.append(",referrer=").append(referrerHash());
            s.append(",md5=").append(md5());
            s.append(",size=").append(size());
            s.append(",wc=").append(wordCount());
            s.append(",dt=").append(doctype());
            s.append(",flags=").append(serverCodings.encodeHex(flags().getBytes()));
            s.append(",lang=").append(language());
            s.append(",llocal=").append(llocal());
            s.append(",lother=").append(lother());
            s.append(",limage=").append(limage());
            s.append(",laudio=").append(laudio());
            s.append(",lvideo=").append(lvideo());
            s.append(",lapp=").append(lapp());
            if (this.word != null) {
                // append also word properties
                s.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false)));
            }
            return s;
        } catch (Exception e) {
            // NOTE(review): failures are silently mapped to null here; callers
            // (toString variants) propagate the null — confirm this is intended
            return null;
        }
    }

    /** @return the backing database row */
    public kelondroRow.Entry toRowEntry() throws IOException {
        return this.entry;
    }

    public String hash() {
        // return a url-hash, based on the md5 algorithm
        // the result is a String of 12 bytes within a 72-bit space
        // (each byte has an 6-bit range)
        // that should be enough for all web pages on the world
        return this.entry.getColString("hash", "", null);
    }

    /**
     * Decodes the 'comp' column into its parts. Missing trailing fields
     * (older/shorter records) are returned as empty strings.
     */
    public de.anomic.plasma.plasmaCrawlLURLEntry.Components comp() {
        ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8");
        return new de.anomic.plasma.plasmaCrawlLURLEntry.Components(
                (cl.size() > 0) ? (String) cl.get(0) : "",
                (cl.size() > 1) ? (String) cl.get(1) : "",
                (cl.size() > 2) ? (String) cl.get(2) : "",
                (cl.size() > 3) ? (String) cl.get(3) : "",
                (cl.size() > 4) ? (String) cl.get(4) : "");
    }

    public Date moddate() {
        // day granularity: stored value is days since epoch
        return new Date(86400000 * entry.getColLong("mod", 0));
    }

    public Date loaddate() {
        return new Date(86400000 * entry.getColLong("load", 0));
    }

    public String referrerHash() {
        // return the creator's hash
        return entry.getColString("referrer", indexURL.dummyHash, null);
    }

    public String md5() {
        // returns the md5 in hex representation
        return serverCodings.encodeHex(entry.getCol("md5", indexURL.dummyHash.getBytes()));
    }

    public char doctype() {
        return (char) entry.getColByte("dt", (byte) 't');
    }

    public String language() {
        return this.entry.getColString("lang", "uk", null);
    }

    public int size() {
        return (int) this.entry.getColLong("size", 0);
    }

    public bitfield flags() {
        return new bitfield(this.entry.getCol("flags", new byte[4]));
    }

    public int wordCount() {
        return (int) this.entry.getColLong("wc", 0);
    }

    public int llocal() {
        return (int) this.entry.getColLong("llocal", 0);
    }

    public int lother() {
        return (int) this.entry.getColLong("lother", 0);
    }

    public int limage() {
        return (int) this.entry.getColLong("limage", 0);
    }

    public int laudio() {
        return (int) this.entry.getColLong("laudio", 0);
    }

    public int lvideo() {
        return (int) this.entry.getColLong("lvideo", 0);
    }

    public int lapp() {
        return (int) this.entry.getColLong("lapp", 0);
    }

    public String snippet() {
        // the snippet may appear here if the url was transported in a remote search
        // it will not be saved anywhere, but can only be requested here
        return snippet;
    }

    public indexEntry word() {
        return word;
    }

    /**
     * Compares by modification date, then load date.
     * Equal dates count as "older" (returns true) so that equal entries
     * are replaceable.
     */
    public boolean isOlder(plasmaCrawlLURLEntry other) {
        if (other == null) return false;
        Date tmoddate = moddate();
        Date omoddate = other.moddate();
        if (tmoddate.before(omoddate)) return true;
        if (tmoddate.equals(omoddate)) {
            Date tloaddate = loaddate();
            Date oloaddate = other.loaddate();
            if (tloaddate.before(oloaddate)) return true;
            if (tloaddate.equals(oloaddate)) return true;
        }
        return false;
    }

    /**
     * Serializes the entry plus a snippet for remote transport.
     *
     * @return the property list, or null if serialization failed
     */
    public String toString(String snippet) {
        // add information needed for remote transport
        final StringBuffer core = corePropList();
        if (core == null) return null;
        core.ensureCapacity(core.length() + snippet.length() * 2);
        core.insert(0, "{");
        core.append(",snippet=").append(crypt.simpleEncode(snippet));
        core.append("}");
        return core.toString();
    }

    /**
     * Returns this object as String.<br>
     * This e.g. looks like this:
     * <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
     *
     * @return the property list, or null if serialization failed
     */
    public String toString() {
        final StringBuffer core = corePropList();
        if (core == null) return null;
        core.insert(0, "{");
        core.append("}");
        return core.toString();
    }
}

@ -36,7 +36,6 @@ import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.tools.crypt; import de.anomic.tools.crypt;
@ -57,7 +56,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
"Cardinal size-" + indexURL.urlSizeLength + " {b64e}, " + // size of file in bytes "Cardinal size-" + indexURL.urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + indexURL.urlWordCountLength + " {b64e}"); // word count "Cardinal wc-" + indexURL.urlWordCountLength + " {b64e}"); // word count
private URL url; private String url;
private String descr; private String descr;
private Date moddate; private Date moddate;
private Date loaddate; private Date loaddate;
@ -73,19 +72,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
private String snippet; private String snippet;
private indexEntry word; // this is only used if the url is transported via remote search requests private indexEntry word; // this is only used if the url is transported via remote search requests
// more needed attributes: public plasmaCrawlLURLOldEntry(String url, String descr, Date moddate,
// - author / copyright owner
// - keywords
// - phrasecount, total number of phrases
// - boolean: URL attributes (see Word-Entity definition)
// - boolean: appearance of bold and/or italics
// - ETag: for re-crawl decision upon HEAD request
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - int: # of keywords
// - int: # der auf der Seite vorhandenen Links zu image, audio, video, applications
public plasmaCrawlLURLOldEntry(URL url, String descr, Date moddate,
Date loaddate, String referrerHash, int copyCount, Date loaddate, String referrerHash, int copyCount,
boolean localNeed, int quality, String language, char doctype, boolean localNeed, int quality, String language, char doctype,
int size, int wordCount) { int size, int wordCount) {
@ -110,7 +97,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
public plasmaCrawlLURLOldEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException { public plasmaCrawlLURLOldEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
try { try {
this.urlHash = entry.getColString(0, null); this.urlHash = entry.getColString(0, null);
this.url = new URL(entry.getColString(1, "UTF-8").trim()); this.url = entry.getColString(1, "UTF-8").trim();
this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim(); this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim();
this.moddate = new Date(86400000 * entry.getColLong(3)); this.moddate = new Date(86400000 * entry.getColLong(3));
this.loaddate = new Date(86400000 * entry.getColLong(4)); this.loaddate = new Date(86400000 * entry.getColLong(4));
@ -144,7 +131,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
this.copyCount = Integer.parseInt(prop.getProperty("cc", "0")); this.copyCount = Integer.parseInt(prop.getProperty("cc", "0"));
this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " "); this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " ");
if (setGlobal) this.flags = "G "; if (setGlobal) this.flags = "G ";
this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null)); this.url = crypt.simpleDecode(prop.getProperty("url", ""), null);
this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null);
if (this.descr == null) this.descr = this.url.toString(); if (this.descr == null) this.descr = this.url.toString();
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", "")); this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
@ -195,13 +182,9 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
// that should be enough for all web pages on the world // that should be enough for all web pages on the world
return this.urlHash; return this.urlHash;
} }
public URL url() { public Components comp() {
return url; return new Components(url, descr, "", "", "");
}
public String descr() {
return descr;
} }
public Date moddate() { public Date moddate() {
@ -263,9 +246,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
if (moddate.before(other.moddate())) return true; if (moddate.before(other.moddate())) return true;
if (moddate.equals(other.moddate())) { if (moddate.equals(other.moddate())) {
if (loaddate.before(other.loaddate())) return true; if (loaddate.before(other.loaddate())) return true;
if (loaddate.equals(other.loaddate())) { if (loaddate.equals(other.loaddate())) return true;
if (quality < other.quality()) return true;
}
} }
return false; return false;
} }
@ -297,30 +278,10 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
return corePropStr; return corePropStr;
} catch (Exception e) { } catch (Exception e) {
// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
// if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
// e.printStackTrace();
return null; return null;
} }
} }
/*
public String toString(int posintext, int posinphrase, int posofphrase) {
// add information needed for remote transport
final StringBuffer core = corePropList();
if (core == null) return null;
core.ensureCapacity(core.length() + 200);
core.insert(0,"{")
.append(",posintext=").append(posintext)
.append(",posinphrase=").append(posinphrase)
.append(",posofphraseint=").append(posofphrase)
.append("}");
return core.toString();
}
*/
public String toString(String snippet) { public String toString(String snippet) {
// add information needed for remote transport // add information needed for remote transport
final StringBuffer core = corePropList(); final StringBuffer core = corePropList();

@ -227,7 +227,7 @@ public class plasmaDHTChunk {
while ((urlIter.hasNext()) && (maxcount > refcount) && (System.currentTimeMillis() < timeout)) { while ((urlIter.hasNext()) && (maxcount > refcount) && (System.currentTimeMillis() < timeout)) {
iEntry = (indexEntry) urlIter.next(); iEntry = (indexEntry) urlIter.next();
lurl = lurls.load(iEntry.urlHash(), iEntry); lurl = lurls.load(iEntry.urlHash(), iEntry);
if ((lurl == null) || (lurl.url() == null)) { if ((lurl == null) || (lurl.comp().url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash()); //yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
notBoundCounter++; notBoundCounter++;
urlIter.remove(); urlIter.remove();

@ -104,7 +104,7 @@ public final class plasmaSearchImages {
plasmaCrawlLURLEntry urlentry; plasmaCrawlLURLEntry urlentry;
while (sres.hasMoreElements()) { while (sres.hasMoreElements()) {
urlentry = sres.nextElement(); urlentry = sres.nextElement();
addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.url(), depth)); addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.comp().url(), depth));
} }
} }

@ -197,8 +197,9 @@ public class plasmaSearchRankingProfile {
long ranking = preranking; long ranking = preranking;
// prefer hit with 'prefer' pattern // prefer hit with 'prefer' pattern
if (page.url().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); plasmaCrawlLURLEntry.Components comp = page.comp();
if (page.descr().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); if (comp.url().toNormalform().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
if (comp.descr().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
// apply 'common-sense' heuristic using references // apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) { for (int j = 0; j < urlcomps.length; j++) {
@ -220,11 +221,11 @@ public class plasmaSearchRankingProfile {
} }
// prefer short urls // prefer short urls
ranking += (256 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue(); ranking += (256 - comp.url().toNormalform().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
ranking += (8 * Math.max(0, 32 - urlcomps.length)) << ((Integer) coeff.get(URLCOMPS)).intValue(); ranking += (8 * Math.max(0, 32 - urlcomps.length)) << ((Integer) coeff.get(URLCOMPS)).intValue();
// prefer long descriptions // prefer long descriptions
ranking += (256 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue(); ranking += (256 * comp.url().toNormalform().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += (256 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue(); ranking += (256 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
return ranking; return ranking;

@ -108,11 +108,10 @@ public final class plasmaSearchResult {
protected void addResult(plasmaCrawlLURLEntry page, Long preranking) { protected void addResult(plasmaCrawlLURLEntry page, Long preranking) {
// take out relevant information for reference computation // take out relevant information for reference computation
URL url = page.url(); plasmaCrawlLURLEntry.Components comp = page.comp();
String descr = page.descr(); if ((comp.url() == null) || (comp.descr() == null)) return;
if ((url == null) || (descr == null)) return; String[] urlcomps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()); // word components of the url
String[] urlcomps = htmlFilterContentScraper.urlComps(url.toString()); // word components of the url String[] descrcomps = comp.descr().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
String[] descrcomps = descr.toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
// store everything // store everything
results.add(new Object[] {page, urlcomps, descrcomps, preranking}); results.add(new Object[] {page, urlcomps, descrcomps, preranking});
@ -168,12 +167,12 @@ public final class plasmaSearchResult {
Iterator i = pageAcc.entrySet().iterator(); Iterator i = pageAcc.entrySet().iterator();
HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation
Map.Entry entry; Map.Entry entry;
String path; String path = null;
// first scan all entries and find all urls that are referenced // first scan all entries and find all urls that are referenced
while (i.hasNext()) { while (i.hasNext()) {
entry = (Map.Entry) i.next(); entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url()); path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url());
paths.put(path, entry.getKey()); paths.put(path, entry.getKey());
//if (path != null) path = shortenPath(path); //if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey()); //if (path != null) paths.put(path, entry.getKey());
@ -184,7 +183,7 @@ public final class plasmaSearchResult {
String shorten; String shorten;
while (i.hasNext()) { while (i.hasNext()) {
entry = (Map.Entry) i.next(); entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url()); path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url());
shorten = shortenPath(path); shorten = shortenPath(path);
// scan all subpaths of the url // scan all subpaths of the url
while (shorten != null) { while (shorten != null) {

@ -289,6 +289,7 @@ public class plasmaSnippetCache {
* @return the parsed document as {@link plasmaParserDocument} * @return the parsed document as {@link plasmaParserDocument}
*/ */
public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) { public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) {
if (url == null) return null;
IResourceInfo docInfo = null; IResourceInfo docInfo = null;
try { try {
// trying to load the resource body from cache // trying to load the resource body from cache
@ -634,11 +635,12 @@ public class plasmaSnippetCache {
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) { while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {
urlentry = acc.nextElement(); urlentry = acc.nextElement();
if (urlentry.url().getHost().endsWith(".yacyh")) continue; plasmaCrawlLURLEntry.Components comp = urlentry.comp();
urlstring = urlentry.url().toNormalform(); if (comp.url().getHost().endsWith(".yacyh")) continue;
urlstring = comp.url().toNormalform();
if ((urlstring.matches(urlmask)) && if ((urlstring.matches(urlmask)) &&
(!(existsInCache(urlentry.url(), queryhashes)))) { (!(existsInCache(comp.url(), queryhashes)))) {
new Fetcher(urlentry.url(), queryhashes, (int) maxTime).start(); new Fetcher(comp.url(), queryhashes, (int) maxTime).start();
i++; i++;
} }
} }

@ -1559,7 +1559,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// create a new loaded URL db entry // create a new loaded URL db entry
plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry( plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry(
entry.url(), // URL entry.url().toNormalform(), // URL
docDescription, // document description docDescription, // document description
docDate, // modification date docDate, // modification date
new Date(), // loaded date new Date(), // loaded date
@ -1641,8 +1641,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String language = indexEntryAttribute.language(entry.url()); String language = indexEntryAttribute.language(entry.url());
char doctype = indexEntryAttribute.docType(document.getMimeType()); char doctype = indexEntryAttribute.docType(document.getMimeType());
int urlLength = newEntry.url().toString().length(); plasmaCrawlLURLEntry.Components comp = newEntry.comp();
int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length; int urlLength = comp.url().toNormalform().length();
int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
// iterate over all words // iterate over all words
Iterator i = condenser.words(); Iterator i = condenser.words();
@ -2046,10 +2047,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("type_globalresults", acc.globalContributions); prop.put("type_globalresults", acc.globalContributions);
int i = 0; int i = 0;
int p; int p;
URL url;
plasmaCrawlLURLEntry urlentry; plasmaCrawlLURLEntry urlentry;
String urlstring, urlname, filename, urlhash; String urlstring, urlname, filename, urlhash;
String host, hash, address, descr = ""; String host, hash, address;
yacySeed seed; yacySeed seed;
plasmaSnippetCache.Snippet snippet; plasmaSnippetCache.Snippet snippet;
boolean includeSnippets = false; boolean includeSnippets = false;
@ -2058,30 +2058,29 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000; if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000;
while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) { while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) {
urlentry = acc.nextElement(); urlentry = acc.nextElement();
url = urlentry.url(); plasmaCrawlLURLEntry.Components comp = urlentry.comp();
urlhash = urlentry.hash(); urlhash = urlentry.hash();
host = url.getHost(); host = comp.url().getHost();
if (host.endsWith(".yacyh")) { if (host.endsWith(".yacyh")) {
// translate host into current IP // translate host into current IP
p = host.indexOf("."); p = host.indexOf(".");
hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6)); hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6));
seed = yacyCore.seedDB.getConnected(hash); seed = yacyCore.seedDB.getConnected(hash);
filename = url.getFile(); filename = comp.url().getFile();
if ((seed == null) || ((address = seed.getAddress()) == null)) { if ((seed == null) || ((address = seed.getAddress()) == null)) {
// seed is not known from here // seed is not known from here
removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + urlentry.descr()).getBytes())); removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes()));
urlPool.loadedURL.remove(urlentry.hash()); // clean up urlPool.loadedURL.remove(urlentry.hash()); // clean up
continue; // next result continue; // next result
} }
url = new URL("http://" + address + "/" + host.substring(0, p) + filename);
urlname = "http://share." + seed.getName() + ".yacy" + filename; urlname = "http://share." + seed.getName() + ".yacy" + filename;
if ((p = urlname.indexOf("?")) > 0) urlname = urlname.substring(0, p); if ((p = urlname.indexOf("?")) > 0) urlname = urlname.substring(0, p);
urlstring = url.toNormalform(); urlstring = "http://" + address + "/" + host.substring(0, p) + filename;
} else { } else {
urlstring = url.toNormalform(); urlstring = comp.url().toNormalform();
urlname = urlstring; urlname = urlstring;
} }
descr = urlentry.descr();
// check bluelist again: filter out all links where any bluelisted word // check bluelist again: filter out all links where any bluelisted word
// appear either in url, url's description or search word // appear either in url, url's description or search word
@ -2097,7 +2096,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL wordURL; URL wordURL;
if (urlstring.matches(query.urlMask)) { //.* is default if (urlstring.matches(query.urlMask)) { //.* is default
if (includeSnippets) { if (includeSnippets) {
snippet = snippetCache.retrieveSnippet(url, query.queryHashes, false, 260, 1000); snippet = snippetCache.retrieveSnippet(comp.url(), query.queryHashes, false, 260, 1000);
} else { } else {
snippet = null; snippet = null;
} }
@ -2107,7 +2106,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("type_results_" + i + "_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", urlstring) == null) ? 1 : 0); prop.put("type_results_" + i + "_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", urlstring) == null) ? 1 : 0);
prop.put("type_results_" + i + "_recommend_deletelink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*"); prop.put("type_results_" + i + "_recommend_deletelink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_recommend_recommendlink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*"); prop.put("type_results_" + i + "_recommend_recommendlink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_description", descr); prop.put("type_results_" + i + "_description", comp.descr());
prop.put("type_results_" + i + "_url", urlstring); prop.put("type_results_" + i + "_url", urlstring);
prop.put("type_results_" + i + "_urlhash", urlhash); prop.put("type_results_" + i + "_urlhash", urlhash);
prop.put("type_results_" + i + "_urlhexhash", yacySeed.b64Hash2hexHash(urlhash)); prop.put("type_results_" + i + "_urlhexhash", yacySeed.b64Hash2hexHash(urlhash));
@ -2196,19 +2195,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// determine the url string // determine the url string
plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null); plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0; if (entry == null) return 0;
plasmaCrawlLURLEntry.Components comp = entry.comp();
URL url = entry.url(); if (comp.url() == null) return 0;
if (url == null) return 0;
InputStream resourceContent = null; InputStream resourceContent = null;
try { try {
// get the resource content // get the resource content
Object[] resource = snippetCache.getResource(url, fetchOnline, 10000); Object[] resource = snippetCache.getResource(comp.url(), fetchOnline, 10000);
resourceContent = (InputStream) resource[0]; resourceContent = (InputStream) resource[0];
Long resourceContentLength = (Long) resource[1]; Long resourceContentLength = (Long) resource[1];
// parse the resource // parse the resource
plasmaParserDocument document = snippetCache.parseDocument(url, resourceContentLength.longValue(), resourceContent); plasmaParserDocument document = snippetCache.parseDocument(comp.url(), resourceContentLength.longValue(), resourceContent);
// getting parsed body input stream // getting parsed body input stream
InputStream docBodyInputStream = document.getText(); InputStream docBodyInputStream = document.getText();

@ -334,7 +334,7 @@ public class plasmaSwitchboardQueue {
if (referrerURL == null) { if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null; if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null); plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null);
if (entry == null) referrerURL = null; else referrerURL = entry.url(); if (entry == null) referrerURL = null; else referrerURL = entry.comp().url();
} }
return referrerURL; return referrerURL;
} }

@ -84,7 +84,7 @@ public class plasmaURLPool {
if (ne != null) return ne.url(); if (ne != null) return ne.url();
} catch (IOException e) {} } catch (IOException e) {}
plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null); plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null);
if (le != null) return le.url(); if (le != null) return le.comp().url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url(); if (ee != null) return ee.url();
return null; return null;

@ -780,7 +780,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
if (ue == null) { if (ue == null) {
urlHashs.add(entry.urlHash()); urlHashs.add(entry.urlHash());
} else { } else {
url = ue.url(); url = ue.comp().url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) { if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash()); urlHashs.add(entry.urlHash());
} }

@ -46,7 +46,7 @@ public class bitfield {
public bitfield(int bytelength) { public bitfield(int bytelength) {
this.bb= new byte[bytelength]; this.bb= new byte[bytelength];
for (int i = 0 ; i < bytelength; i++) bb[i] = (char) 48; for (int i = 0 ; i < bytelength; i++) bb[i] = 0;
} }
public bitfield(byte[] field) { public bitfield(byte[] field) {

@ -502,7 +502,9 @@ public final class yacyClient {
for (int n = 0; n < results; n++) { for (int n = 0; n < results; n++) {
// get one single search result // get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true); urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if ((urlEntry == null) || (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, urlEntry.url()))) { continue; } // block with backlist if (urlEntry == null) continue;
plasmaCrawlLURLEntry.Components comp = urlEntry.comp();
if (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, comp.url())) continue; // block with backlist
urlManager.store(urlEntry); urlManager.store(urlEntry);
urlManager.stack(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); urlManager.stack(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
@ -510,19 +512,20 @@ public final class yacyClient {
final indexEntry entry; final indexEntry entry;
if (urlEntry.word() == null) { if (urlEntry.word() == null) {
// the old way to define words // the old way to define words
int urlLength = urlEntry.url().toString().length(); int urlLength = comp.url().toNormalform().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length; int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
entry = new indexURLEntry( entry = new indexURLEntry(
urlEntry.hash(), urlEntry.hash(),
urlLength, urlComps, urlLength,
urlEntry.descr().length(), urlComps,
comp.descr().length(),
urlEntry.wordCount(), urlEntry.wordCount(),
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
urlEntry.size(), urlEntry.size(),
urlEntry.moddate().getTime(), urlEntry.moddate().getTime(),
System.currentTimeMillis(), System.currentTimeMillis(),
urlEntry.quality(), 0,
urlEntry.language(), urlEntry.language(),
urlEntry.doctype(), urlEntry.doctype(),
0,0, 0,0,

@ -958,7 +958,8 @@ public final class yacy {
while (eiter.hasNext()) { while (eiter.hasNext()) {
try { try {
entry = (plasmaCrawlLURLEntry) eiter.next(); entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), null); plasmaCrawlLURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null);
} catch (Exception e) { } catch (Exception e) {
// here a MalformedURLException may occur // here a MalformedURLException may occur
// just ignore // just ignore
@ -1068,12 +1069,13 @@ public final class yacy {
plasmaCrawlLURLEntry entry; plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) { while (eiter.hasNext()) {
entry = (plasmaCrawlLURLEntry) eiter.next(); entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) { plasmaCrawlLURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) {
if (html) { if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.descr() + "</a><br>").getBytes("UTF-8")); bos.write(("<a href=\"" + comp.url().toNormalform() + "\">" + comp.descr() + "</a><br>").getBytes("UTF-8"));
bos.write(serverCore.crlf); bos.write(serverCore.crlf);
} else { } else {
bos.write(entry.url().toString().getBytes()); bos.write(comp.url().toNormalform().getBytes());
bos.write(serverCore.crlf); bos.write(serverCore.crlf);
} }
} }
@ -1128,7 +1130,8 @@ public final class yacy {
plasmaCrawlLURLEntry entry; plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) { while (eiter.hasNext()) {
entry = (plasmaCrawlLURLEntry) eiter.next(); entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) { plasmaCrawlLURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) {
fsp.put(entry.toRowEntry(), entry.loaddate()); fsp.put(entry.toRowEntry(), entry.loaddate());
} }
} }

Loading…
Cancel
Save