- added new LURL.Entry class for next database migration

- refactoring of affected classes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2802 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent cc24dde5e0
commit b79e06615d

@ -149,15 +149,15 @@ public class Bookmarks {
// try to get the bookmark from the LURL database
plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaParserDocument document = null;
if(urlentry != null){
document = switchboard.snippetCache.retrieveDocument(urlentry.url(), true);
}
if (urlentry != null) {
plasmaCrawlLURLEntry.Components comp = urlentry.comp();
document = switchboard.snippetCache.retrieveDocument(comp.url(), true);
prop.put("mode_edit", 0); // create mode
prop.put("mode_title", urlentry.descr());
prop.put("mode_description", (document == null) ? urlentry.descr() : document.getMainLongTitle());
prop.put("mode_url", urlentry.url());
prop.put("mode_tags", (document == null) ? "" : document.getKeywords(','));
prop.put("mode_url", comp.url().toNormalform());
prop.put("mode_title", comp.descr());
prop.put("mode_description", (document == null) ? comp.descr(): document.getMainLongTitle());
prop.put("mode_author", comp.author());
prop.put("mode_tags", (document == null) ? comp.tags() : document.getKeywords(','));
prop.put("mode_public", 0);
}
if (document != null) document.close();

@ -164,9 +164,6 @@
<tr><td class="small">Loaded-Date</td><td class="tt">#[loaddate]#</td></tr>
<tr><td class="small">Referrer</td><td class="tt">#[referrer]#</td></tr>
<tr><td class="small">Doctype</td><td class="tt">#[doctype]#</td></tr>
<tr><td class="small">Copy-Count</td><td class="tt">#[copyCount]#</td></tr>
<tr><td class="small">Local-Flag</td><td class="tt">#[local]#</td></tr>
<tr><td class="small">Quality</td><td class="tt">#[quality]#</td></tr>
<tr><td class="small">Language</td><td class="tt">#[language]#</td></tr>
<tr><td class="small">Size</td><td class="tt">#[size]#</td></tr>
<tr><td class="small">Words</td><td class="tt">#[wordCount]#</td></tr>

@ -222,8 +222,7 @@ public class IndexControl_p {
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
URL url = entry.url();
urlstring = url.toNormalform();
urlstring = entry.comp().url().toNormalform();
prop.put("urlstring", "");
switchboard.urlPool.loadedURL.remove(urlhash);
prop.put("result", "Removed URL " + urlstring);
@ -339,9 +338,7 @@ public class IndexControl_p {
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
URL url = entry.url();
urlstring = url.toString();
prop.put("urlstring", urlstring);
prop.put("urlstring", entry.comp().url().toNormalform());
prop.putAll(genUrlProfile(switchboard, entry, urlhash));
}
}
@ -410,30 +407,27 @@ public class IndexControl_p {
prop.put("genUrlProfile_urlhash", urlhash);
return prop;
}
URL url = entry.url();
plasmaCrawlLURLEntry.Components comp = entry.comp();
String referrer = null;
plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (le == null) {
referrer = "<unknown>";
} else {
referrer = le.url().toString();
referrer = le.comp().url().toNormalform();
}
if (url == null) {
if (comp.url() == null) {
prop.put("genUrlProfile", 1);
prop.put("genUrlProfile_urlhash", urlhash);
return prop;
}
prop.put("genUrlProfile", 2);
prop.put("genUrlProfile_urlNormalform", url.toNormalform());
prop.put("genUrlProfile_urlNormalform", comp.url().toNormalform());
prop.put("genUrlProfile_urlhash", urlhash);
prop.put("genUrlProfile_urlDescr", entry.descr());
prop.put("genUrlProfile_urlDescr", comp.descr());
prop.put("genUrlProfile_moddate", entry.moddate());
prop.put("genUrlProfile_loaddate", entry.loaddate());
prop.put("genUrlProfile_referrer", referrer);
prop.put("genUrlProfile_doctype", ""+entry.doctype());
prop.put("genUrlProfile_copyCount", entry.copyCount());
prop.put("genUrlProfile_local", ""+entry.local());
prop.put("genUrlProfile_quality", entry.quality());
prop.put("genUrlProfile_language", entry.language());
prop.put("genUrlProfile_size", entry.size());
prop.put("genUrlProfile_wordCount", entry.wordCount());
@ -467,7 +461,7 @@ public class IndexControl_p {
if (le == null) {
tm.put(uh[0], uh);
} else {
us = le.url().toString();
us = le.comp().url().toNormalform();
tm.put(us, uh);
}

@ -337,6 +337,7 @@ public class PerformanceMemory_p {
}
private static void putprop(serverObjects prop, serverSwitch env, String wdb, String db, String set) {
if ((slt == null) || (ost == null)) return;
usd = chk * slt[1] + obj * ost[2] /*hit*/ + kelondroTree.cacheObjectMissSize * ost[3] /*miss*/;
bst = (((((long) chk) * ((long) req)) >> 10) + 1) << 10;
if (set.equals("setBest")) env.setConfig("ramCache" + db, bst);

@ -54,7 +54,6 @@ import java.util.Enumeration;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
@ -116,12 +115,12 @@ public class ViewFile {
}
// gettin the url that belongs to the entry
URL url = urlEntry.url();
if (url == null) {
plasmaCrawlLURLEntry.Components comp = urlEntry.comp();
if ((comp == null) || (comp.url() == null)) {
prop.put("error",3);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
}
// loading the resource content as byte array
InputStream resource = null;
@ -130,14 +129,14 @@ public class ViewFile {
String resMime = null;
try {
// trying to load the resource body
resource = sb.cacheManager.getResourceContentStream(url);
resourceLength = sb.cacheManager.getResourceContentLength(url);
resource = sb.cacheManager.getResourceContentStream(comp.url());
resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
// if the resource body was not cached we try to load it from web
if (resource == null) {
plasmaHTCache.Entry entry = null;
try {
entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false);
entry = sb.snippetCache.loadResourceFromWeb(comp.url(), 5000, false);
} catch (plasmaCrawlerException e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
@ -147,8 +146,8 @@ public class ViewFile {
if (entry != null) {
resInfo = entry.getDocumentInfo();
resource = sb.cacheManager.getResourceContentStream(url);
resourceLength = sb.cacheManager.getResourceContentLength(url);
resource = sb.cacheManager.getResourceContentStream(comp.url());
resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
}
if (resource == null) {
@ -164,19 +163,19 @@ public class ViewFile {
// try to load the metadata from cache
try {
resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url());
resInfo = sb.cacheManager.loadResourceInfo(comp.url());
} catch (Exception e) { /* ignore this */}
// if the metadata where not cached try to load it from web
if (resInfo == null) {
String protocol = url.getProtocol();
String protocol = comp.url().getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error",6);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
httpHeader responseHeader = httpc.whead(comp.url(),comp.url().getHost(),5000,null,null,sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error",4);
prop.put("error_errorText","Unable to load resource metadata.");
@ -184,7 +183,7 @@ public class ViewFile {
return prop;
}
try {
resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader);
resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(comp.url(), responseHeader);
} catch (Exception e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
@ -230,12 +229,12 @@ public class ViewFile {
prop.put("viewMode_plainText",content);
} else if (viewMode.equals("iframe")) {
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url",url.toString());
prop.put("viewMode_url",comp.url().toNormalform());
} else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
document = sb.snippetCache.parseDocument(url, resourceLength, resource,resInfo);
document = sb.snippetCache.parseDocument(comp.url(), resourceLength, resource,resInfo);
if (document == null) {
prop.put("error",5);
prop.put("error_errorText","Unknown error");
@ -295,13 +294,13 @@ public class ViewFile {
}
if (document != null) document.close();
}
prop.put("error",0);
prop.put("error_url",url.toString());
prop.put("error_hash",urlHash);
prop.put("error_wordCount",Integer.toString(urlEntry.wordCount()));
prop.put("error_desc",urlEntry.descr());
prop.put("error_size",urlEntry.size());
prop.put("error_mimeType",resMime);
prop.put("error", 0);
prop.put("error_url", comp.url().toNormalform());
prop.put("error_hash", urlHash);
prop.put("error_wordCount", Integer.toString(urlEntry.wordCount()));
prop.put("error_desc", comp.descr());
prop.put("error_size", urlEntry.size());
prop.put("error_mimeType", resMime);
return prop;
}

@ -359,7 +359,7 @@ public class dir {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
url, "YaCyShare: " + descr, new Date(), new Date(),
url.toNormalform(), "YaCyShare: " + descr, new Date(), new Date(),
"AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/
false, /*localneed*/

@ -125,8 +125,13 @@ public final class crawlReceipt {
} else if (result.equals("fill")) {
// generating a new loaded URL entry
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true);
if ((entry == null)||(entry.url()==null)) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam +
if (entry == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
} else {
plasmaCrawlLURLEntry.Components comp = entry.comp();
if (comp.url() == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
} else try {
// put new entry into database
@ -134,18 +139,18 @@ public final class crawlReceipt {
switchboard.urlPool.loadedURL.stack(entry, youare, iam, 1);
// generating url hash
String newUrlHash = indexURL.urlHash(entry.url());
String oldUrlHash = indexURL.oldurlHash(entry.url());
String newUrlHash = indexURL.urlHash(comp.url());
String oldUrlHash = indexURL.oldurlHash(comp.url());
// removing URL from notice URL
switchboard.urlPool.noticeURL.remove(newUrlHash);
switchboard.urlPool.noticeURL.remove(oldUrlHash);
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + entry.url());
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + comp.url().toNormalform());
} catch (IOException e) {
e.printStackTrace();
}
}
// ready for more
prop.put("delay", "10");
} else {

@ -249,7 +249,7 @@ public final class search {
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = (plasmaCrawlLURLEntry) acc.nextElement();
if (includesnippet) {
snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260, 1000);
snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, 260, 1000);
} else {
snippet = null;
}

@ -98,25 +98,29 @@ public final class transferURL {
yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
} else {
lEntry = sb.urlPool.loadedURL.newEntry(urls, true);
if ((lEntry != null) && (lEntry.url() != null)) {
if ((blockBlacklist) &&
(plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), lEntry.url()))) {
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
blocked++;
} else try {
sb.urlPool.loadedURL.store(lEntry);
sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + lEntry.url() + "' from peer " + otherPeerName);
received++;
} catch (IOException e) {
e.printStackTrace();
}
} else {
yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName +
"\n\tURL Property: " + urls);
if (lEntry == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???
} else {
plasmaCrawlLURLEntry.Components comp = lEntry.comp();
if (comp.url() == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???
} else {
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
blocked++;
} else try {
sb.urlPool.loadedURL.store(lEntry);
sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
received++;
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}

@ -191,13 +191,15 @@ public class yacysearch {
final String recommendHash = post.get("recommendref", ""); // urlhash
plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
if (urlentry != null) {
plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true);
plasmaCrawlLURLEntry.Components comp = urlentry.comp();
plasmaParserDocument document;
document = sb.snippetCache.retrieveDocument(comp.url(), true);
if (document != null) {
// create a news message
HashMap map = new HashMap();
map.put("url", urlentry.url().toNormalform().replace(',', '|'));
map.put("title", urlentry.descr().replace(',', ' '));
map.put("description", ((document == null) ? urlentry.descr() : document.getMainLongTitle()).replace(',', ' '));
map.put("url", comp.url().toNormalform().replace(',', '|'));
map.put("title", comp.descr().replace(',', ' '));
map.put("description", ((document == null) ? comp.descr() : document.getMainLongTitle()).replace(',', ' '));
map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
document.close();

@ -52,6 +52,7 @@ public class indexURL {
public static final int urlStringLength = 256;// not too short for links without parameters
public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>)
public static final int urlNameLength = 40; // the tag content between <a> and </a>
public static final int urldescrtagsLength = 320;// the url, the description and tags in one string
public static final int urlErrorLength = 80; // a reason description for unavailable urls
public static final int urlDateLength = 4; // any date, shortened
public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index

@ -158,7 +158,10 @@ public class kelondroCollectionIndex {
ientry.setCol(idx_col_indexpos, j);
ientry.setCol(idx_col_lastread, t);
ientry.setCol(idx_col_lastwrote, t);
index.put(ientry);
if (index instanceof kelondroBufferedIndex)
((kelondroBufferedIndex) index).add(ientry);
else
index.put(ientry);
// write a log
if (System.currentTimeMillis() - lastlog > 30000) {

@ -40,7 +40,7 @@ public class kelondroRow {
protected kelondroColumn[] row;
protected int[] colstart;
protected int objectsize;
protected Map nickref = null;
protected Map nickref = null; // a mapping from nicknames to Object[2]{kelondroColumn, Integer(colstart)}
public kelondroRow(kelondroColumn[] row) {
this.row = row;
@ -142,7 +142,12 @@ public class kelondroRow {
if (external == null) return null;
return new Entry(external);
}
/*
public Entry newEntry(Properties prop) {
if (prop == null) return null;
return new Entry(prop);
}
*/
public class Entry implements Comparable {
private byte[] rowinstance;
@ -202,7 +207,19 @@ public class kelondroRow {
}
}
}
/*
public Entry(Properties prop) {
// parse external form
if (nickref == null) genNickRef();
rowinstance = new byte[objectsize];
Iterator i = prop.entrySet().iterator();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
setCol(((String) entry.getKey()).trim(), ((String) entry.getValue()).trim().getBytes());
}
}
*/
public int compareTo(Object o) {
if (o instanceof Entry) {
return kelondroNaturalOrder.naturalOrder.compare(this.rowinstance, ((Entry) o).rowinstance);
@ -354,7 +371,7 @@ public class kelondroRow {
return getColLong(row[column].encoder(), colstart[column], row[column].cellwidth());
}
public long getColLong(int encoder, int offset, int length) {
private long getColLong(int encoder, int offset, int length) {
// start - fix for badly stored parameters
if ((length >= 3) && (rowinstance[offset] == '[') && (rowinstance[offset + 1] == 'B') && (rowinstance[offset + 2] == '@')) return 0;
if ((length == 2) && (rowinstance[offset] == '[') && (rowinstance[offset + 1] == 'B')) return 0;
@ -378,6 +395,13 @@ public class kelondroRow {
throw new kelondroException("ROW", "getColLong did not find appropriate encoding");
}
public byte getColByte(String nickname, byte dflt) {
if (nickref == null) genNickRef();
Object[] ref = (Object[]) nickref.get(nickname);
if (ref == null) return dflt;
return rowinstance[((Integer) ref[1]).intValue()];
}
public byte getColByte(int column) {
return rowinstance[colstart[column]];
}

@ -195,7 +195,7 @@ public final class plasmaCrawlLURL extends indexURL {
}
}
public synchronized plasmaCrawlLURLEntry newEntry(URL url, String descr, Date moddate, Date loaddate,
public synchronized plasmaCrawlLURLEntry newEntry(String url, String descr, Date moddate, Date loaddate,
String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype,
int size, int wordCount) {
@ -338,7 +338,6 @@ public final class plasmaCrawlLURL extends indexURL {
String cachepath, urlstr, urltxt;
yacySeed initiatorSeed, executorSeed;
plasmaCrawlLURLEntry urle;
URL url;
// needed for getCachePath(url)
final plasmaSwitchboard switchboard = plasmaSwitchboard.getSwitchboard();
@ -353,14 +352,14 @@ public final class plasmaCrawlLURL extends indexURL {
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = load(urlHash, null);
plasmaCrawlLURLEntry.Components comp = urle.comp();
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash);
url = urle.url();
urlstr = url.toString();
urlstr = comp.url().toNormalform();
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
cachepath = (url == null) ? "-not-cached-" : cacheManager.getCachePath(url).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1);
cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1);
prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0);
prop.put("table_indexed_" + cnt + "_feedbackpage", feedbackpage);
@ -372,8 +371,8 @@ public final class plasmaCrawlLURL extends indexURL {
prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? dfltExec : executorSeed.getName());
prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate()));
prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount());
prop.put("table_indexed_" + cnt + "_urldescr", urle.descr());
prop.put("table_indexed_" + cnt + "_url", (urle.url() == null) ? "-not-cached-" : ((makeLink) ? ("<a href=\"CacheAdmin_p.html?action=info&path=" + cachepath + "\" class=\"small\" title=\"" + urlstr + "\">" + urltxt + "</a>") : urlstr));
prop.put("table_indexed_" + cnt + "_urldescr", comp.descr());
prop.put("table_indexed_" + cnt + "_url", (cachepath == null) ? "-not-cached-" : ((makeLink) ? ("<a href=\"CacheAdmin_p.html?action=info&path=" + cachepath + "\" class=\"small\" title=\"" + urlstr + "\">" + urltxt + "</a>") : urlstr));
dark = !dark;
cnt++;
} catch (Exception e) {
@ -535,18 +534,19 @@ public final class plasmaCrawlLURL extends indexURL {
}
plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next();
plasmaCrawlLURLEntry.Components comp = entry.comp();
totalSearchedUrls++;
if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, entry.url()) ||
plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, entry.url())) {
lastBlacklistedUrl = entry.url().toString();
if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, comp.url()) ||
plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, comp.url())) {
lastBlacklistedUrl = comp.url().toNormalform();
lastBlacklistedHash = entry.hash();
serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + entry.url());
serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + comp.url().toNormalform());
remove(entry.hash());
if (blacklistedUrls % 100 == 0) {
serverLog.logInfo("URLDBCLEANER", "Deleted " + blacklistedUrls + " URLs until now. Last deleted URL-Hash: " + lastBlacklistedUrl);
}
}
lastUrl = entry.url().toString();
lastUrl = comp.url().toNormalform();
lastHash = entry.hash();
}
} catch (RuntimeException e) {
@ -605,7 +605,7 @@ public final class plasmaCrawlLURL extends indexURL {
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false);
final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) {
((plasmaCrawlLURLEntry) enu.next()).print();
System.out.println(((plasmaCrawlLURLEntry) enu.next()).toString());
}
} catch (Exception e) {
e.printStackTrace();

@ -27,10 +27,11 @@
package de.anomic.plasma;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import de.anomic.net.URL;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.index.indexEntry;
public interface plasmaCrawlLURLEntry {
@ -39,9 +40,7 @@ public interface plasmaCrawlLURLEntry {
public String hash();
public URL url();
public String descr();
public Components comp();
public Date moddate();
@ -51,12 +50,6 @@ public interface plasmaCrawlLURLEntry {
public char doctype();
public int copyCount();
public boolean local();
public int quality();
public String language();
public int size();
@ -73,6 +66,26 @@ public interface plasmaCrawlLURLEntry {
public String toString();
public void print();
public class Components {

    // Parsed per-URL metadata bundle: the URL itself plus its textual
    // companions. The URL field is null when the given string is malformed.
    private URL url;
    private String descr, author, tags, ETag;

    public Components(String url, String descr, String author, String tags, String ETag) {
        // parse eagerly; a bad URL degrades to null rather than throwing
        URL parsed;
        try {
            parsed = new URL(url);
        } catch (MalformedURLException e) {
            parsed = null;
        }
        this.url = parsed;
        this.descr = descr;
        this.author = author;
        this.tags = tags;
        this.ETag = ETag;
    }

    public URL url() { return this.url; }

    public String descr() { return this.descr; }

    public String author() { return this.author; }

    public String tags() { return this.tags; }

    public String ETag() { return this.ETag; }
}
}

@ -0,0 +1,337 @@
package de.anomic.plasma;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.Properties;
import java.util.ArrayList;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCodings;
import de.anomic.tools.crypt;
import de.anomic.tools.bitfield;
import de.anomic.tools.nxTools;
public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry {
// Row definition for the loaded-URL table. Column tokens must be separated
// by ", " inside the concatenated definition string; the "md5" column was
// previously missing its trailing separator, fusing it with the "size"
// column into the malformed token "md5-8Cardinal size-6".
public static final kelondroRow rowdef = new kelondroRow(
"String hash-12, " + // the url's hash
"String comp-360, " + // components: the url, description, author and tags. As 5th element, an ETag is possible
"Cardinal mod-4 {b256}, " + // last-modified from the httpd
"Cardinal load-4 {b256}, " + // time when the url was loaded
"String referrer-12, " + // (one of) the url's referrer hash(es)
"byte[] md5-8, " + // the md5 of the url content (to identify changes)
"Cardinal size-6 {b256}, " + // size of file in bytes
"Cardinal wc-3 {b256}, " + // size of file by number of words; for video and audio: seconds
"byte[] dt-1, " + // doctype, taken from extension or any other heuristic
"byte[] flags-4, " + // flags; any stuff (see Word-Entity definition)
"String lang-2, " + // language
"Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width
"Cardinal lother-2 {b256}, " + // # of outlinks to outside domain; for video and image: height
"Cardinal limage-2 {b256}, " + // # of embedded image links
"Cardinal laudio-2 {b256}, " + // # of embedded audio links; for audio: track number; for video: number of audio tracks
"Cardinal lvideo-2 {b256}, " + // # of embedded video links
"Cardinal lapp-2 {b256}"); // # of embedded links to applications
private kelondroRow.Entry entry;
private String snippet;
private indexEntry word; // this is only used if the url is transported via remote search requests
// Constructs a new loaded-URL entry from explicit field values, encoding
// every value into a kelondroRow.Entry laid out according to rowdef.
// Textual components (descr/author/tags/ETag) are packed together with the
// URL into the single "comp" column via encodeComp; dates are stored as
// day counts via encodeDate.
public plasmaCrawlLURLNewEntry(
URL url,
String descr,
String author,
String tags,
String ETag,
Date mod,
Date load,
String referrer,
byte[] md5,
long size,
int wc,
byte dt,
bitfield flags,
String lang,
int llocal,
int lother,
int laudio,
int limage,
int lvideo,
int lapp) {
// create new entry and store it into database
this.entry = rowdef.newEntry();
this.entry.setCol("hash", indexURL.urlHash(url), null);
this.entry.setCol("comp", encodeComp(url, descr, author, tags, ETag));
this.entry.setCol("mod", encodeDate(mod));
this.entry.setCol("load", encodeDate(load));
this.entry.setCol("referrer", referrer.getBytes());
this.entry.setCol("md5", md5);
this.entry.setCol("size", size);
this.entry.setCol("wc", wc);
this.entry.setCol("dt", dt);
this.entry.setCol("flags", flags.getBytes());
this.entry.setCol("lang", lang.getBytes());
this.entry.setCol("llocal", llocal);
this.entry.setCol("lother", lother);
this.entry.setCol("limage", limage);
this.entry.setCol("laudio", laudio);
this.entry.setCol("lvideo", lvideo);
this.entry.setCol("lapp", lapp);
// snippet and word are transient, only used for remote-search transport
this.snippet = null;
this.word = null;
}
// Encodes a date as a 4-byte day count (milliseconds since epoch divided
// by 86400000 ms/day); decoded back in moddate()/loaddate().
byte[] encodeDate(Date d) {
return kelondroNaturalOrder.encodeLong(d.getTime() / 86400000, 4);
}
// Packs the URL and its textual components into one LF-separated byte
// sequence for the "comp" column. The order MUST match the parse order in
// comp(): url, descr, author, tags, ETag.
// Fixed: descr was previously never appended, which shifted author/tags/ETag
// one slot up when comp() split the stored column back into five parts.
byte[] encodeComp(URL url, String descr, String author, String tags, String ETag) {
serverByteBuffer s = new serverByteBuffer(200);
s.append(url.toNormalform()).append((char) 10);
s.append(descr).append((char) 10);
s.append(author).append((char) 10);
s.append(tags).append((char) 10);
s.append(ETag).append((char) 10);
return s.getBytes();
}
// Wraps an already-loaded database row; searchedWord carries the word-index
// entry when this URL entry is produced by a (remote) search request.
public plasmaCrawlLURLNewEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
this.entry = entry;
this.snippet = null;
this.word = searchedWord;
}
// Reconstructs an entry from the property-list form produced by
// corePropList()/toString(). Each textual property is crypt-decoded; missing
// or unparseable values fall back to defaults (empty strings, current date,
// dummy hashes, zero counts).
// NOTE(review): the setGlobal parameter is not used in this visible body —
// confirm whether it is intentionally ignored here.
public plasmaCrawlLURLNewEntry(Properties prop, boolean setGlobal) throws IOException {
// generates an plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
URL url;
try {
url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
} catch (MalformedURLException e) {
throw new IOException("URL is not proper: " + crypt.simpleDecode(prop.getProperty("url", ""), null));
}
// simpleDecode may return null; normalize all text components to ""
String descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (descr == null) descr = "";
String author = crypt.simpleDecode(prop.getProperty("author", ""), null); if (author == null) author = "";
String tags = crypt.simpleDecode(prop.getProperty("tags", ""), null); if (tags == null) tags = "";
String ETag = crypt.simpleDecode(prop.getProperty("ETag", ""), null); if (ETag == null) ETag = "";
this.entry = rowdef.newEntry();
this.entry.setCol("hash", indexURL.urlHash(url), null);
this.entry.setCol("comp", encodeComp(url, descr, author, tags, ETag));
// dates are transported as yyyyMMdd strings; bad values degrade to "now"
try {
this.entry.setCol("mod", encodeDate(indexURL.shortDayFormatter.parse(prop.getProperty("mod", "20000101"))));
} catch (ParseException e) {
this.entry.setCol("mod", encodeDate(new Date()));
}
try {
this.entry.setCol("load", encodeDate(indexURL.shortDayFormatter.parse(prop.getProperty("load", "20000101"))));
} catch (ParseException e) {
this.entry.setCol("load", encodeDate(new Date()));
}
this.entry.setCol("referrer", prop.getProperty("referrer", indexURL.dummyHash).getBytes());
this.entry.setCol("md5", serverCodings.decodeHex(prop.getProperty("md5", indexURL.dummyHash)));
this.entry.setCol("size", Integer.parseInt(prop.getProperty("size", "0")));
this.entry.setCol("wc", Integer.parseInt(prop.getProperty("wc", "0")));
this.entry.setCol("dt", prop.getProperty("dt", "t").charAt(0));
this.entry.setCol("flags", serverCodings.decodeHex(prop.getProperty("flags", "00000000")));
this.entry.setCol("lang", prop.getProperty("lang", "uk").getBytes());
this.entry.setCol("llocal", Integer.parseInt(prop.getProperty("llocal", "0")));
this.entry.setCol("lother", Integer.parseInt(prop.getProperty("lother", "0")));
this.entry.setCol("limage", Integer.parseInt(prop.getProperty("limage", "0")));
this.entry.setCol("laudio", Integer.parseInt(prop.getProperty("laudio", "0")));
this.entry.setCol("lvideo", Integer.parseInt(prop.getProperty("lvideo", "0")));
this.entry.setCol("lapp", Integer.parseInt(prop.getProperty("lapp", "0")));
this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null);
// a "word" property carries a base64-encoded index entry from remote search
this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null;
}
// Serializes this entry into the comma-separated property-list form that the
// Properties-based constructor parses back. Text components are crypt-encoded
// so they cannot collide with the ',' and '=' separators.
// NOTE(review): the broad catch deliberately turns any serialization failure
// into a null return (see commented-out diagnostics below); callers must
// handle null.
private StringBuffer corePropList() {
// generate a parseable string; this is a simple property-list
plasmaCrawlLURLEntry.Components comp = this.comp();
final StringBuffer s = new StringBuffer(300);
try {
s.append("hash=").append(hash());
s.append(",url=").append(crypt.simpleEncode(comp.url().toNormalform()));
s.append(",descr=").append(crypt.simpleEncode(comp.descr()));
s.append(",author=").append(crypt.simpleEncode(comp.author()));
s.append(",tags=").append(crypt.simpleEncode(comp.tags()));
s.append(",ETag=").append(crypt.simpleEncode(comp.ETag()));
s.append(",mod=").append(indexURL.shortDayFormatter.format(moddate()));
s.append(",load=").append(indexURL.shortDayFormatter.format(loaddate()));
s.append(",referrer=").append(referrerHash());
s.append(",md5=").append(md5());
s.append(",size=").append(size());
s.append(",wc=").append(wordCount());
s.append(",dt=").append(doctype());
s.append(",flags=").append(serverCodings.encodeHex(flags().getBytes()));
s.append(",lang=").append(language());
s.append(",llocal=").append(llocal());
s.append(",lother=").append(lother());
s.append(",limage=").append(limage());
s.append(",laudio=").append(laudio());
s.append(",lvideo=").append(lvideo());
s.append(",lapp=").append(lapp());
if (this.word != null) {
// append also word properties
s.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false)));
}
return s;
} catch (Exception e) {
// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
// if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
// e.printStackTrace();
return null;
}
}
/**
 * Exposes the backing database row of this entry.
 *
 * @return the underlying kelondroRow.Entry (not a copy)
 * @throws IOException declared by the interface; this implementation
 *         never actually throws it
 */
public kelondroRow.Entry toRowEntry() throws IOException {
    return this.entry;
}
public String hash() {
    // return a url-hash, based on the md5 algorithm
    // the result is a String of 12 bytes within a 72-bit space
    // (each byte has a 6-bit range)
    // that should be enough for all web pages on the world
    return this.entry.getColString("hash", "", null);
}
/**
 * Decodes the UTF-8 component list stored in the "comp" column into a
 * Components object (url, descr, author, tags, ETag -- in the same
 * order corePropList serializes them).  Missing trailing entries
 * default to the empty string.
 */
public de.anomic.plasma.plasmaCrawlLURLEntry.Components comp() {
    ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8");
    final String[] c = new String[5];
    for (int i = 0; i < c.length; i++) {
        c[i] = (cl.size() > i) ? (String) cl.get(i) : "";
    }
    return new de.anomic.plasma.plasmaCrawlLURLEntry.Components(c[0], c[1], c[2], c[3], c[4]);
}
/**
 * Last-modified date of the document; the "mod" column stores the
 * value as days since epoch, so it is scaled back to milliseconds.
 */
public Date moddate() {
    final long days = entry.getColLong("mod", 0);
    return new Date(days * 86400000L);
}
/**
 * Date the document was loaded; the "load" column stores the value as
 * days since epoch, so it is scaled back to milliseconds.
 */
public Date loaddate() {
    final long days = entry.getColLong("load", 0);
    return new Date(days * 86400000L);
}
public String referrerHash() {
    // hash of the referring URL; falls back to the dummy hash when unset
    return entry.getColString("referrer", indexURL.dummyHash, null);
}
/**
 * Returns the md5 digest of the document in hex representation; a
 * missing value falls back to the dummy-hash bytes.
 */
public String md5() {
    final byte[] digest = entry.getCol("md5", indexURL.dummyHash.getBytes());
    return serverCodings.encodeHex(digest);
}
public char doctype() {
    // single-character document type code from the "dt" column;
    // defaults to 't' -- presumably plain text, TODO confirm against writers
    return (char) entry.getColByte("dt", (byte) 't');
}
public String language() {
    // two-letter language code; "uk" is the stored default --
    // NOTE(review): looks like it is meant as "unknown", confirm
    return this.entry.getColString("lang", "uk", null);
}
public int size() {
    // size of the document in bytes (stored as long, narrowed to int)
    return (int) this.entry.getColLong("size", 0);
}
public bitfield flags() {
    // bit flags of this entry; defaults to four zero bytes when unset
    return new bitfield(this.entry.getCol("flags", new byte[4]));
}
public int wordCount() {
    // number of words in the document ("wc" column)
    return (int) this.entry.getColLong("wc", 0);
}
public int llocal() {
    // number of outlinks to the same (local) domain
    return (int) this.entry.getColLong("llocal", 0);
}
public int lother() {
    // number of outlinks to other (outside) domains
    return (int) this.entry.getColLong("lother", 0);
}
public int limage() {
    // number of links on the page pointing to image resources
    return (int) this.entry.getColLong("limage", 0);
}
public int laudio() {
    // number of links on the page pointing to audio resources
    return (int) this.entry.getColLong("laudio", 0);
}
public int lvideo() {
    // number of links on the page pointing to video resources
    return (int) this.entry.getColLong("lvideo", 0);
}
public int lapp() {
    // number of links on the page pointing to application resources
    return (int) this.entry.getColLong("lapp", 0);
}
public String snippet() {
    // the snippet may appear here if the url was transported in a remote search;
    // it will not be saved anywhere, but can only be requested here
    return snippet;
}
public indexEntry word() {
    // search-word context; only non-null when this entry was created
    // from a remote-search property list containing a "word" key
    return word;
}
/**
 * Decides whether this entry is older than the given one: true when
 * this modification date precedes the other's, or -- on equal
 * modification dates -- when this load date does not come after the
 * other's (date ties count as older).
 *
 * @param other the entry to compare against; null yields false
 */
public boolean isOlder(plasmaCrawlLURLEntry other) {
    if (other == null) return false;
    final Date myMod = moddate();
    final Date otherMod = other.moddate();
    if (myMod.before(otherMod)) return true;
    if (!myMod.equals(otherMod)) return false;
    // equal modification dates: fall back to the load dates
    return !loaddate().after(other.loaddate());
}
/**
 * Serializes this entry plus the given snippet for remote transport:
 * the core property list wrapped in braces with a trailing
 * ",snippet=..." pair (snippet encoded via crypt.simpleEncode).
 *
 * @return the serialized form, or null if the core properties could
 *         not be generated
 */
public String toString(String snippet) {
    final StringBuffer props = corePropList();
    if (props == null) return null;
    final StringBuffer result = new StringBuffer(props.length() + snippet.length() * 2 + 16);
    result.append("{").append(props).append(",snippet=").append(crypt.simpleEncode(snippet)).append("}");
    return result.toString();
}
/**
 * Returns this object as String.<br>
 * This e.g. looks like this:
 * <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
 *
 * @return the brace-wrapped core property list, or null if it could
 *         not be generated
 */
public String toString() {
    final StringBuffer props = corePropList();
    return (props == null) ? null : "{" + props + "}";
}
}

@ -36,7 +36,6 @@ import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.crypt;
@ -57,7 +56,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
"Cardinal size-" + indexURL.urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + indexURL.urlWordCountLength + " {b64e}"); // word count
private URL url;
private String url;
private String descr;
private Date moddate;
private Date loaddate;
@ -73,19 +72,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
private String snippet;
private indexEntry word; // this is only used if the url is transported via remote search requests
// more needed attributes:
// - author / copyright owner
// - keywords
// - phrasecount, total number of phrases
// - boolean: URL attributes (see Word-Entity definition)
// - boolean: appearance of bold and/or italics
// - ETag: for re-crawl decision upon HEAD request
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - int: # of keywords
// - int: # der auf der Seite vorhandenen Links zu image, audio, video, applications
public plasmaCrawlLURLOldEntry(URL url, String descr, Date moddate,
public plasmaCrawlLURLOldEntry(String url, String descr, Date moddate,
Date loaddate, String referrerHash, int copyCount,
boolean localNeed, int quality, String language, char doctype,
int size, int wordCount) {
@ -110,7 +97,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
public plasmaCrawlLURLOldEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
try {
this.urlHash = entry.getColString(0, null);
this.url = new URL(entry.getColString(1, "UTF-8").trim());
this.url = entry.getColString(1, "UTF-8").trim();
this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim();
this.moddate = new Date(86400000 * entry.getColLong(3));
this.loaddate = new Date(86400000 * entry.getColLong(4));
@ -144,7 +131,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
this.copyCount = Integer.parseInt(prop.getProperty("cc", "0"));
this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " ");
if (setGlobal) this.flags = "G ";
this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
this.url = crypt.simpleDecode(prop.getProperty("url", ""), null);
this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null);
if (this.descr == null) this.descr = this.url.toString();
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
@ -195,13 +182,9 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
// that should be enough for all web pages on the world
return this.urlHash;
}
public URL url() {
return url;
}
public String descr() {
return descr;
public Components comp() {
return new Components(url, descr, "", "", "");
}
public Date moddate() {
@ -263,9 +246,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
if (moddate.before(other.moddate())) return true;
if (moddate.equals(other.moddate())) {
if (loaddate.before(other.loaddate())) return true;
if (loaddate.equals(other.loaddate())) {
if (quality < other.quality()) return true;
}
if (loaddate.equals(other.loaddate())) return true;
}
return false;
}
@ -297,30 +278,10 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry {
return corePropStr;
} catch (Exception e) {
// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
// if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
// e.printStackTrace();
return null;
}
}
/*
public String toString(int posintext, int posinphrase, int posofphrase) {
// add information needed for remote transport
final StringBuffer core = corePropList();
if (core == null) return null;
core.ensureCapacity(core.length() + 200);
core.insert(0,"{")
.append(",posintext=").append(posintext)
.append(",posinphrase=").append(posinphrase)
.append(",posofphraseint=").append(posofphrase)
.append("}");
return core.toString();
}
*/
public String toString(String snippet) {
// add information needed for remote transport
final StringBuffer core = corePropList();

@ -227,7 +227,7 @@ public class plasmaDHTChunk {
while ((urlIter.hasNext()) && (maxcount > refcount) && (System.currentTimeMillis() < timeout)) {
iEntry = (indexEntry) urlIter.next();
lurl = lurls.load(iEntry.urlHash(), iEntry);
if ((lurl == null) || (lurl.url() == null)) {
if ((lurl == null) || (lurl.comp().url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
notBoundCounter++;
urlIter.remove();

@ -104,7 +104,7 @@ public final class plasmaSearchImages {
plasmaCrawlLURLEntry urlentry;
while (sres.hasMoreElements()) {
urlentry = sres.nextElement();
addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.url(), depth));
addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.comp().url(), depth));
}
}

@ -197,8 +197,9 @@ public class plasmaSearchRankingProfile {
long ranking = preranking;
// prefer hit with 'prefer' pattern
if (page.url().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
if (page.descr().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
plasmaCrawlLURLEntry.Components comp = page.comp();
if (comp.url().toNormalform().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
if (comp.descr().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
// apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) {
@ -220,11 +221,11 @@ public class plasmaSearchRankingProfile {
}
// prefer short urls
ranking += (256 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
ranking += (256 - comp.url().toNormalform().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
ranking += (8 * Math.max(0, 32 - urlcomps.length)) << ((Integer) coeff.get(URLCOMPS)).intValue();
// prefer long descriptions
ranking += (256 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += (256 * comp.url().toNormalform().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += (256 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
return ranking;

@ -108,11 +108,10 @@ public final class plasmaSearchResult {
protected void addResult(plasmaCrawlLURLEntry page, Long preranking) {
// take out relevant information for reference computation
URL url = page.url();
String descr = page.descr();
if ((url == null) || (descr == null)) return;
String[] urlcomps = htmlFilterContentScraper.urlComps(url.toString()); // word components of the url
String[] descrcomps = descr.toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
plasmaCrawlLURLEntry.Components comp = page.comp();
if ((comp.url() == null) || (comp.descr() == null)) return;
String[] urlcomps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()); // word components of the url
String[] descrcomps = comp.descr().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
// store everything
results.add(new Object[] {page, urlcomps, descrcomps, preranking});
@ -168,12 +167,12 @@ public final class plasmaSearchResult {
Iterator i = pageAcc.entrySet().iterator();
HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation
Map.Entry entry;
String path;
String path = null;
// first scan all entries and find all urls that are referenced
while (i.hasNext()) {
entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url());
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url());
paths.put(path, entry.getKey());
//if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey());
@ -184,7 +183,7 @@ public final class plasmaSearchResult {
String shorten;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url());
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url());
shorten = shortenPath(path);
// scan all subpaths of the url
while (shorten != null) {

@ -289,6 +289,7 @@ public class plasmaSnippetCache {
* @return the parsed document as {@link plasmaParserDocument}
*/
public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) {
if (url == null) return null;
IResourceInfo docInfo = null;
try {
// trying to load the resource body from cache
@ -634,11 +635,12 @@ public class plasmaSnippetCache {
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {
urlentry = acc.nextElement();
if (urlentry.url().getHost().endsWith(".yacyh")) continue;
urlstring = urlentry.url().toNormalform();
plasmaCrawlLURLEntry.Components comp = urlentry.comp();
if (comp.url().getHost().endsWith(".yacyh")) continue;
urlstring = comp.url().toNormalform();
if ((urlstring.matches(urlmask)) &&
(!(existsInCache(urlentry.url(), queryhashes)))) {
new Fetcher(urlentry.url(), queryhashes, (int) maxTime).start();
(!(existsInCache(comp.url(), queryhashes)))) {
new Fetcher(comp.url(), queryhashes, (int) maxTime).start();
i++;
}
}

@ -1559,7 +1559,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// create a new loaded URL db entry
plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry(
entry.url(), // URL
entry.url().toNormalform(), // URL
docDescription, // document description
docDate, // modification date
new Date(), // loaded date
@ -1641,8 +1641,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String language = indexEntryAttribute.language(entry.url());
char doctype = indexEntryAttribute.docType(document.getMimeType());
int urlLength = newEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length;
plasmaCrawlLURLEntry.Components comp = newEntry.comp();
int urlLength = comp.url().toNormalform().length();
int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
// iterate over all words
Iterator i = condenser.words();
@ -2046,10 +2047,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("type_globalresults", acc.globalContributions);
int i = 0;
int p;
URL url;
plasmaCrawlLURLEntry urlentry;
String urlstring, urlname, filename, urlhash;
String host, hash, address, descr = "";
String host, hash, address;
yacySeed seed;
plasmaSnippetCache.Snippet snippet;
boolean includeSnippets = false;
@ -2058,30 +2058,29 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000;
while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) {
urlentry = acc.nextElement();
url = urlentry.url();
plasmaCrawlLURLEntry.Components comp = urlentry.comp();
urlhash = urlentry.hash();
host = url.getHost();
host = comp.url().getHost();
if (host.endsWith(".yacyh")) {
// translate host into current IP
p = host.indexOf(".");
hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6));
seed = yacyCore.seedDB.getConnected(hash);
filename = url.getFile();
filename = comp.url().getFile();
if ((seed == null) || ((address = seed.getAddress()) == null)) {
// seed is not known from here
removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + urlentry.descr()).getBytes()));
removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes()));
urlPool.loadedURL.remove(urlentry.hash()); // clean up
continue; // next result
}
url = new URL("http://" + address + "/" + host.substring(0, p) + filename);
urlname = "http://share." + seed.getName() + ".yacy" + filename;
if ((p = urlname.indexOf("?")) > 0) urlname = urlname.substring(0, p);
urlstring = url.toNormalform();
urlstring = "http://" + address + "/" + host.substring(0, p) + filename;
} else {
urlstring = url.toNormalform();
urlstring = comp.url().toNormalform();
urlname = urlstring;
}
descr = urlentry.descr();
// check bluelist again: filter out all links where any bluelisted word
// appear either in url, url's description or search word
@ -2097,7 +2096,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL wordURL;
if (urlstring.matches(query.urlMask)) { //.* is default
if (includeSnippets) {
snippet = snippetCache.retrieveSnippet(url, query.queryHashes, false, 260, 1000);
snippet = snippetCache.retrieveSnippet(comp.url(), query.queryHashes, false, 260, 1000);
} else {
snippet = null;
}
@ -2107,7 +2106,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("type_results_" + i + "_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", urlstring) == null) ? 1 : 0);
prop.put("type_results_" + i + "_recommend_deletelink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_recommend_recommendlink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_description", descr);
prop.put("type_results_" + i + "_description", comp.descr());
prop.put("type_results_" + i + "_url", urlstring);
prop.put("type_results_" + i + "_urlhash", urlhash);
prop.put("type_results_" + i + "_urlhexhash", yacySeed.b64Hash2hexHash(urlhash));
@ -2196,19 +2195,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// determine the url string
plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0;
URL url = entry.url();
if (url == null) return 0;
plasmaCrawlLURLEntry.Components comp = entry.comp();
if (comp.url() == null) return 0;
InputStream resourceContent = null;
try {
// get the resource content
Object[] resource = snippetCache.getResource(url, fetchOnline, 10000);
Object[] resource = snippetCache.getResource(comp.url(), fetchOnline, 10000);
resourceContent = (InputStream) resource[0];
Long resourceContentLength = (Long) resource[1];
// parse the resource
plasmaParserDocument document = snippetCache.parseDocument(url, resourceContentLength.longValue(), resourceContent);
plasmaParserDocument document = snippetCache.parseDocument(comp.url(), resourceContentLength.longValue(), resourceContent);
// getting parsed body input stream
InputStream docBodyInputStream = document.getText();

@ -334,7 +334,7 @@ public class plasmaSwitchboardQueue {
if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null);
if (entry == null) referrerURL = null; else referrerURL = entry.url();
if (entry == null) referrerURL = null; else referrerURL = entry.comp().url();
}
return referrerURL;
}

@ -84,7 +84,7 @@ public class plasmaURLPool {
if (ne != null) return ne.url();
} catch (IOException e) {}
plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null);
if (le != null) return le.url();
if (le != null) return le.comp().url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
return null;

@ -780,7 +780,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
if (ue == null) {
urlHashs.add(entry.urlHash());
} else {
url = ue.url();
url = ue.comp().url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
}

@ -46,7 +46,7 @@ public class bitfield {
public bitfield(int bytelength) {
this.bb= new byte[bytelength];
for (int i = 0 ; i < bytelength; i++) bb[i] = (char) 48;
for (int i = 0 ; i < bytelength; i++) bb[i] = 0;
}
public bitfield(byte[] field) {

@ -502,7 +502,9 @@ public final class yacyClient {
for (int n = 0; n < results; n++) {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if ((urlEntry == null) || (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, urlEntry.url()))) { continue; } // block with backlist
if (urlEntry == null) continue;
plasmaCrawlLURLEntry.Components comp = urlEntry.comp();
if (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, comp.url())) continue; // block with backlist
urlManager.store(urlEntry);
urlManager.stack(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
@ -510,19 +512,20 @@ public final class yacyClient {
final indexEntry entry;
if (urlEntry.word() == null) {
// the old way to define words
int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
int urlLength = comp.url().toNormalform().length();
int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length;
entry = new indexURLEntry(
urlEntry.hash(),
urlLength, urlComps,
urlEntry.descr().length(),
urlLength,
urlComps,
comp.descr().length(),
urlEntry.wordCount(),
0, 0, 0, 0, 0, 0,
urlEntry.size(),
urlEntry.moddate().getTime(),
System.currentTimeMillis(),
urlEntry.quality(),
0,
urlEntry.language(),
urlEntry.doctype(),
0,0,

@ -958,7 +958,8 @@ public final class yacy {
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), null);
plasmaCrawlLURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null);
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
@ -1068,12 +1069,13 @@ public final class yacy {
plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
plasmaCrawlLURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) {
if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.descr() + "</a><br>").getBytes("UTF-8"));
bos.write(("<a href=\"" + comp.url().toNormalform() + "\">" + comp.descr() + "</a><br>").getBytes("UTF-8"));
bos.write(serverCore.crlf);
} else {
bos.write(entry.url().toString().getBytes());
bos.write(comp.url().toNormalform().getBytes());
bos.write(serverCore.crlf);
}
}
@ -1128,7 +1130,8 @@ public final class yacy {
plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
plasmaCrawlLURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) {
fsp.put(entry.toRowEntry(), entry.loaddate());
}
}

Loading…
Cancel
Save