From b79e06615d45eccea7129a2a8a7f89936221d756 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 18 Oct 2006 22:25:07 +0000 Subject: [PATCH] - added new LURL.Entry class for next database migration - refactoring of affected classes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2802 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Bookmarks.java | 14 +- htroot/IndexControl_p.html | 3 - htroot/IndexControl_p.java | 22 +- htroot/PerformanceMemory_p.java | 1 + htroot/ViewFile.java | 43 ++- htroot/htdocsdefault/dir.java | 2 +- htroot/yacy/crawlReceipt.java | 17 +- htroot/yacy/search.java | 2 +- htroot/yacy/transferURL.java | 40 ++- htroot/yacysearch.java | 10 +- source/de/anomic/index/indexURL.java | 1 + .../kelondro/kelondroCollectionIndex.java | 5 +- source/de/anomic/kelondro/kelondroRow.java | 32 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 26 +- .../anomic/plasma/plasmaCrawlLURLEntry.java | 37 +- .../plasma/plasmaCrawlLURLNewEntry.java | 337 ++++++++++++++++++ .../plasma/plasmaCrawlLURLOldEntry.java | 55 +-- source/de/anomic/plasma/plasmaDHTChunk.java | 2 +- .../de/anomic/plasma/plasmaSearchImages.java | 2 +- .../plasma/plasmaSearchRankingProfile.java | 9 +- .../de/anomic/plasma/plasmaSearchResult.java | 15 +- .../de/anomic/plasma/plasmaSnippetCache.java | 10 +- .../de/anomic/plasma/plasmaSwitchboard.java | 38 +- .../anomic/plasma/plasmaSwitchboardQueue.java | 2 +- source/de/anomic/plasma/plasmaURLPool.java | 2 +- source/de/anomic/plasma/plasmaWordIndex.java | 2 +- source/de/anomic/tools/bitfield.java | 2 +- source/de/anomic/yacy/yacyClient.java | 15 +- source/yacy.java | 13 +- 29 files changed, 553 insertions(+), 206 deletions(-) create mode 100644 source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index ddc520540..111169f3a 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -149,15 +149,15 @@ public class Bookmarks { // try to get the bookmark from the LURL database 
plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); plasmaParserDocument document = null; - if(urlentry != null){ - document = switchboard.snippetCache.retrieveDocument(urlentry.url(), true); - } if (urlentry != null) { + plasmaCrawlLURLEntry.Components comp = urlentry.comp(); + document = switchboard.snippetCache.retrieveDocument(comp.url(), true); prop.put("mode_edit", 0); // create mode - prop.put("mode_title", urlentry.descr()); - prop.put("mode_description", (document == null) ? urlentry.descr() : document.getMainLongTitle()); - prop.put("mode_url", urlentry.url()); - prop.put("mode_tags", (document == null) ? "" : document.getKeywords(',')); + prop.put("mode_url", comp.url().toNormalform()); + prop.put("mode_title", comp.descr()); + prop.put("mode_description", (document == null) ? comp.descr(): document.getMainLongTitle()); + prop.put("mode_author", comp.author()); + prop.put("mode_tags", (document == null) ? comp.tags() : document.getKeywords(',')); prop.put("mode_public", 0); } if (document != null) document.close(); diff --git a/htroot/IndexControl_p.html b/htroot/IndexControl_p.html index ada516dc4..47fc1564d 100644 --- a/htroot/IndexControl_p.html +++ b/htroot/IndexControl_p.html @@ -164,9 +164,6 @@ Loaded-Date#[loaddate]# Referrer#[referrer]# Doctype#[doctype]# - Copy-Count#[copyCount]# - Local-Flag#[local]# - Quality#[quality]# Language#[language]# Size#[size]# Words#[wordCount]# diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index c1c4381aa..73d44636f 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -222,8 +222,7 @@ public class IndexControl_p { if (entry == null) { prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } else { - URL url = entry.url(); - urlstring = url.toNormalform(); + urlstring = entry.comp().url().toNormalform(); prop.put("urlstring", ""); switchboard.urlPool.loadedURL.remove(urlhash); prop.put("result", "Removed URL " + 
urlstring); @@ -339,9 +338,7 @@ public class IndexControl_p { if (entry == null) { prop.put("result", "No Entry for URL hash " + urlhash); } else { - URL url = entry.url(); - urlstring = url.toString(); - prop.put("urlstring", urlstring); + prop.put("urlstring", entry.comp().url().toNormalform()); prop.putAll(genUrlProfile(switchboard, entry, urlhash)); } } @@ -410,30 +407,27 @@ public class IndexControl_p { prop.put("genUrlProfile_urlhash", urlhash); return prop; } - URL url = entry.url(); + plasmaCrawlLURLEntry.Components comp = entry.comp(); String referrer = null; plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null); if (le == null) { referrer = ""; } else { - referrer = le.url().toString(); + referrer = le.comp().url().toNormalform(); } - if (url == null) { + if (comp.url() == null) { prop.put("genUrlProfile", 1); prop.put("genUrlProfile_urlhash", urlhash); return prop; } prop.put("genUrlProfile", 2); - prop.put("genUrlProfile_urlNormalform", url.toNormalform()); + prop.put("genUrlProfile_urlNormalform", comp.url().toNormalform()); prop.put("genUrlProfile_urlhash", urlhash); - prop.put("genUrlProfile_urlDescr", entry.descr()); + prop.put("genUrlProfile_urlDescr", comp.descr()); prop.put("genUrlProfile_moddate", entry.moddate()); prop.put("genUrlProfile_loaddate", entry.loaddate()); prop.put("genUrlProfile_referrer", referrer); prop.put("genUrlProfile_doctype", ""+entry.doctype()); - prop.put("genUrlProfile_copyCount", entry.copyCount()); - prop.put("genUrlProfile_local", ""+entry.local()); - prop.put("genUrlProfile_quality", entry.quality()); prop.put("genUrlProfile_language", entry.language()); prop.put("genUrlProfile_size", entry.size()); prop.put("genUrlProfile_wordCount", entry.wordCount()); @@ -467,7 +461,7 @@ public class IndexControl_p { if (le == null) { tm.put(uh[0], uh); } else { - us = le.url().toString(); + us = le.comp().url().toNormalform(); tm.put(us, uh); } diff --git a/htroot/PerformanceMemory_p.java 
b/htroot/PerformanceMemory_p.java index 07e689a21..b880900a8 100644 --- a/htroot/PerformanceMemory_p.java +++ b/htroot/PerformanceMemory_p.java @@ -337,6 +337,7 @@ public class PerformanceMemory_p { } private static void putprop(serverObjects prop, serverSwitch env, String wdb, String db, String set) { + if ((slt == null) || (ost == null)) return; usd = chk * slt[1] + obj * ost[2] /*hit*/ + kelondroTree.cacheObjectMissSize * ost[3] /*miss*/; bst = (((((long) chk) * ((long) req)) >> 10) + 1) << 10; if (set.equals("setBest")) env.setConfig("ramCache" + db, bst); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index ced7a6386..7302d7465 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -54,7 +54,6 @@ import java.util.Enumeration; import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; import de.anomic.http.httpc; -import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParserDocument; @@ -116,12 +115,12 @@ public class ViewFile { } // gettin the url that belongs to the entry - URL url = urlEntry.url(); - if (url == null) { + plasmaCrawlLURLEntry.Components comp = urlEntry.comp(); + if ((comp == null) || (comp.url() == null)) { prop.put("error",3); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; - } + } // loading the resource content as byte array InputStream resource = null; @@ -130,14 +129,14 @@ public class ViewFile { String resMime = null; try { // trying to load the resource body - resource = sb.cacheManager.getResourceContentStream(url); - resourceLength = sb.cacheManager.getResourceContentLength(url); + resource = sb.cacheManager.getResourceContentStream(comp.url()); + resourceLength = sb.cacheManager.getResourceContentLength(comp.url()); // if the resource body was not cached we try to load it from web if (resource == null) { plasmaHTCache.Entry entry = null; try { - entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false); + entry = 
sb.snippetCache.loadResourceFromWeb(comp.url(), 5000, false); } catch (plasmaCrawlerException e) { prop.put("error",4); prop.put("error_errorText",e.getMessage()); @@ -147,8 +146,8 @@ public class ViewFile { if (entry != null) { resInfo = entry.getDocumentInfo(); - resource = sb.cacheManager.getResourceContentStream(url); - resourceLength = sb.cacheManager.getResourceContentLength(url); + resource = sb.cacheManager.getResourceContentStream(comp.url()); + resourceLength = sb.cacheManager.getResourceContentLength(comp.url()); } if (resource == null) { @@ -164,19 +163,19 @@ public class ViewFile { // try to load the metadata from cache try { - resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url()); + resInfo = sb.cacheManager.loadResourceInfo(comp.url()); } catch (Exception e) { /* ignore this */} // if the metadata where not cached try to load it from web if (resInfo == null) { - String protocol = url.getProtocol(); + String protocol = comp.url().getProtocol(); if (!((protocol.equals("http") || protocol.equals("https")))) { prop.put("error",6); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; } - httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig); + httpHeader responseHeader = httpc.whead(comp.url(),comp.url().getHost(),5000,null,null,sb.remoteProxyConfig); if (responseHeader == null) { prop.put("error",4); prop.put("error_errorText","Unable to load resource metadata."); @@ -184,7 +183,7 @@ public class ViewFile { return prop; } try { - resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader); + resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(comp.url(), responseHeader); } catch (Exception e) { prop.put("error",4); prop.put("error_errorText",e.getMessage()); @@ -230,12 +229,12 @@ public class ViewFile { prop.put("viewMode_plainText",content); } else if (viewMode.equals("iframe")) { prop.put("viewMode",VIEW_MODE_AS_IFRAME); - 
prop.put("viewMode_url",url.toString()); + prop.put("viewMode_url",comp.url().toNormalform()); } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) { // parsing the resource content plasmaParserDocument document = null; try { - document = sb.snippetCache.parseDocument(url, resourceLength, resource,resInfo); + document = sb.snippetCache.parseDocument(comp.url(), resourceLength, resource,resInfo); if (document == null) { prop.put("error",5); prop.put("error_errorText","Unknown error"); @@ -295,13 +294,13 @@ public class ViewFile { } if (document != null) document.close(); } - prop.put("error",0); - prop.put("error_url",url.toString()); - prop.put("error_hash",urlHash); - prop.put("error_wordCount",Integer.toString(urlEntry.wordCount())); - prop.put("error_desc",urlEntry.descr()); - prop.put("error_size",urlEntry.size()); - prop.put("error_mimeType",resMime); + prop.put("error", 0); + prop.put("error_url", comp.url().toNormalform()); + prop.put("error_hash", urlHash); + prop.put("error_wordCount", Integer.toString(urlEntry.wordCount())); + prop.put("error_desc", comp.descr()); + prop.put("error_size", urlEntry.size()); + prop.put("error_mimeType", resMime); return prop; } diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index b85223a75..cf29c69e7 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -359,7 +359,7 @@ public class dir { final URL url = new URL(urlstring); final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". 
" + descr).getBytes())); final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry( - url, "YaCyShare: " + descr, new Date(), new Date(), + url.toNormalform(), "YaCyShare: " + descr, new Date(), new Date(), "AAAAAAAAAAAA", /*referrer*/ 0, /*copycount*/ false, /*localneed*/ diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index eb29cf5f6..57d295c29 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -125,8 +125,13 @@ public final class crawlReceipt { } else if (result.equals("fill")) { // generating a new loaded URL entry plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true); - if ((entry == null)||(entry.url()==null)) { - log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam + + if (entry == null) { + log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam + + "\n\tURL properties: "+ propStr); + } else { + plasmaCrawlLURLEntry.Components comp = entry.comp(); + if (comp.url() == null) { + log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + receivedUrlhash + " from peer " + iam + "\n\tURL properties: "+ propStr); } else try { // put new entry into database @@ -134,18 +139,18 @@ public final class crawlReceipt { switchboard.urlPool.loadedURL.stack(entry, youare, iam, 1); // generating url hash - String newUrlHash = indexURL.urlHash(entry.url()); - String oldUrlHash = indexURL.oldurlHash(entry.url()); + String newUrlHash = indexURL.urlHash(comp.url()); + String oldUrlHash = indexURL.oldurlHash(comp.url()); // removing URL from notice URL switchboard.urlPool.noticeURL.remove(newUrlHash); switchboard.urlPool.noticeURL.remove(oldUrlHash); - log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + entry.url()); + log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for 
URL " + receivedUrlhash + ":" + comp.url().toNormalform()); } catch (IOException e) { e.printStackTrace(); } - + } // ready for more prop.put("delay", "10"); } else { diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 6f687d874..a30d41367 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -249,7 +249,7 @@ public final class search { while ((acc.hasMoreElements()) && (i < squery.wantedResults)) { urlentry = (plasmaCrawlLURLEntry) acc.nextElement(); if (includesnippet) { - snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260, 1000); + snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, 260, 1000); } else { snippet = null; } diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 9ae72dfb7..ac551bc81 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -98,25 +98,29 @@ public final class transferURL { yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName); } else { lEntry = sb.urlPool.loadedURL.newEntry(urls, true); - if ((lEntry != null) && (lEntry.url() != null)) { - if ((blockBlacklist) && - (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), lEntry.url()))) { - int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash()); - yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs"); - lEntry = null; - blocked++; - } else try { - sb.urlPool.loadedURL.store(lEntry); - sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3); - yacyCore.log.logFine("transferURL: received URL '" + lEntry.url() + "' from peer " + otherPeerName); - received++; - } catch (IOException e) { - e.printStackTrace(); - } - } else { - yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + - "\n\tURL Property: " + urls); + if (lEntry == null) { + 
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls); // TODO: should we send back an error message??? + } else { + plasmaCrawlLURLEntry.Components comp = lEntry.comp(); + if (comp.url() == null) { + yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls); + // TODO: should we send back an error message??? + } else { + if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) { + int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash()); + yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs"); + lEntry = null; + blocked++; + } else try { + sb.urlPool.loadedURL.store(lEntry); + sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3); + yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName); + received++; + } catch (IOException e) { + e.printStackTrace(); + } + } } } } diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 71e7f8996..1721351fd 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -191,13 +191,15 @@ public class yacysearch { final String recommendHash = post.get("recommendref", ""); // urlhash plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null); if (urlentry != null) { - plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true); + plasmaCrawlLURLEntry.Components comp = urlentry.comp(); + plasmaParserDocument document; + document = sb.snippetCache.retrieveDocument(comp.url(), true); if (document != null) { // create a news message HashMap map = new HashMap(); - map.put("url", urlentry.url().toNormalform().replace(',', '|')); - map.put("title", urlentry.descr().replace(',', ' ')); - 
map.put("description", ((document == null) ? urlentry.descr() : document.getMainLongTitle()).replace(',', ' ')); + map.put("url", comp.url().toNormalform().replace(',', '|')); + map.put("title", comp.descr().replace(',', ' ')); + map.put("description", ((document == null) ? comp.descr() : document.getMainLongTitle()).replace(',', ' ')); map.put("tags", ((document == null) ? "" : document.getKeywords(' '))); yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map)); document.close(); diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java index c92380fc5..41cbfda73 100644 --- a/source/de/anomic/index/indexURL.java +++ b/source/de/anomic/index/indexURL.java @@ -52,6 +52,7 @@ public class indexURL { public static final int urlStringLength = 256;// not too short for links without parameters public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or

) public static final int urlNameLength = 40; // the tag content between and + public static final int urldescrtagsLength = 320;// the url, the description and tags in one string public static final int urlErrorLength = 80; // a reason description for unavailable urls public static final int urlDateLength = 4; // any date, shortened public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java index 2932f8d19..7a73b35eb 100644 --- a/source/de/anomic/kelondro/kelondroCollectionIndex.java +++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java @@ -158,7 +158,10 @@ public class kelondroCollectionIndex { ientry.setCol(idx_col_indexpos, j); ientry.setCol(idx_col_lastread, t); ientry.setCol(idx_col_lastwrote, t); - index.put(ientry); + if (index instanceof kelondroBufferedIndex) + ((kelondroBufferedIndex) index).add(ientry); + else + index.put(ientry); // write a log if (System.currentTimeMillis() - lastlog > 30000) { diff --git a/source/de/anomic/kelondro/kelondroRow.java b/source/de/anomic/kelondro/kelondroRow.java index 846515795..f172aebcf 100644 --- a/source/de/anomic/kelondro/kelondroRow.java +++ b/source/de/anomic/kelondro/kelondroRow.java @@ -40,7 +40,7 @@ public class kelondroRow { protected kelondroColumn[] row; protected int[] colstart; protected int objectsize; - protected Map nickref = null; + protected Map nickref = null; // a mapping from nicknames to Object[2]{kelondroColumn, Integer(colstart)} public kelondroRow(kelondroColumn[] row) { this.row = row; @@ -142,7 +142,12 @@ public class kelondroRow { if (external == null) return null; return new Entry(external); } - + /* + public Entry newEntry(Properties prop) { + if (prop == null) return null; + return new Entry(prop); + } + */ public class Entry implements Comparable { private byte[] rowinstance; @@ -202,7 +207,19 @@ public class kelondroRow { 
} } } - + /* + public Entry(Properties prop) { + // parse external form + if (nickref == null) genNickRef(); + rowinstance = new byte[objectsize]; + Iterator i = prop.entrySet().iterator(); + Map.Entry entry; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + setCol(((String) entry.getKey()).trim(), ((String) entry.getValue()).trim().getBytes()); + } + } + */ public int compareTo(Object o) { if (o instanceof Entry) { return kelondroNaturalOrder.naturalOrder.compare(this.rowinstance, ((Entry) o).rowinstance); @@ -354,7 +371,7 @@ public class kelondroRow { return getColLong(row[column].encoder(), colstart[column], row[column].cellwidth()); } - public long getColLong(int encoder, int offset, int length) { + private long getColLong(int encoder, int offset, int length) { // start - fix for badly stored parameters if ((length >= 3) && (rowinstance[offset] == '[') && (rowinstance[offset + 1] == 'B') && (rowinstance[offset + 2] == '@')) return 0; if ((length == 2) && (rowinstance[offset] == '[') && (rowinstance[offset + 1] == 'B')) return 0; @@ -378,6 +395,13 @@ public class kelondroRow { throw new kelondroException("ROW", "getColLong did not find appropriate encoding"); } + public byte getColByte(String nickname, byte dflt) { + if (nickref == null) genNickRef(); + Object[] ref = (Object[]) nickref.get(nickname); + if (ref == null) return dflt; + return rowinstance[((Integer) ref[1]).intValue()]; + } + public byte getColByte(int column) { return rowinstance[colstart[column]]; } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 8ba925c57..52376ec02 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -195,7 +195,7 @@ public final class plasmaCrawlLURL extends indexURL { } } - public synchronized plasmaCrawlLURLEntry newEntry(URL url, String descr, Date moddate, Date loaddate, + public synchronized plasmaCrawlLURLEntry newEntry(String url, String 
descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) { @@ -338,7 +338,6 @@ public final class plasmaCrawlLURL extends indexURL { String cachepath, urlstr, urltxt; yacySeed initiatorSeed, executorSeed; plasmaCrawlLURLEntry urle; - URL url; // needed for getCachePath(url) final plasmaSwitchboard switchboard = plasmaSwitchboard.getSwitchboard(); @@ -353,14 +352,14 @@ public final class plasmaCrawlLURL extends indexURL { // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash); try { urle = load(urlHash, null); + plasmaCrawlLURLEntry.Components comp = urle.comp(); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString()); initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); executorSeed = yacyCore.seedDB.getConnected(executorHash); - url = urle.url(); - urlstr = url.toString(); + urlstr = comp.url().toNormalform(); urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL - cachepath = (url == null) ? "-not-cached-" : cacheManager.getCachePath(url).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1); + cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1); prop.put("table_indexed_" + cnt + "_dark", (dark) ? 1 : 0); prop.put("table_indexed_" + cnt + "_feedbackpage", feedbackpage); @@ -372,8 +371,8 @@ public final class plasmaCrawlLURL extends indexURL { prop.put("table_indexed_" + cnt + "_showExec_executorSeed", (executorSeed == null) ? dfltExec : executorSeed.getName()); prop.put("table_indexed_" + cnt + "_moddate", daydate(urle.moddate())); prop.put("table_indexed_" + cnt + "_wordcount", urle.wordCount()); - prop.put("table_indexed_" + cnt + "_urldescr", urle.descr()); - prop.put("table_indexed_" + cnt + "_url", (urle.url() == null) ? 
"-not-cached-" : ((makeLink) ? ("" + urltxt + "") : urlstr)); + prop.put("table_indexed_" + cnt + "_urldescr", comp.descr()); + prop.put("table_indexed_" + cnt + "_url", (cachepath == null) ? "-not-cached-" : ((makeLink) ? ("" + urltxt + "") : urlstr)); dark = !dark; cnt++; } catch (Exception e) { @@ -535,18 +534,19 @@ public final class plasmaCrawlLURL extends indexURL { } plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next(); + plasmaCrawlLURLEntry.Components comp = entry.comp(); totalSearchedUrls++; - if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, entry.url()) || - plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, entry.url())) { - lastBlacklistedUrl = entry.url().toString(); + if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, comp.url()) || + plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, comp.url())) { + lastBlacklistedUrl = comp.url().toNormalform(); lastBlacklistedHash = entry.hash(); - serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + entry.url()); + serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + comp.url().toNormalform()); remove(entry.hash()); if (blacklistedUrls % 100 == 0) { serverLog.logInfo("URLDBCLEANER", "Deleted " + blacklistedUrls + " URLs until now. 
Last deleted URL-Hash: " + lastBlacklistedUrl); } } - lastUrl = entry.url().toString(); + lastUrl = comp.url().toNormalform(); lastHash = entry.hash(); } } catch (RuntimeException e) { @@ -605,7 +605,7 @@ public final class plasmaCrawlLURL extends indexURL { final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false); final Iterator enu = urls.entries(true, false, null); while (enu.hasNext()) { - ((plasmaCrawlLURLEntry) enu.next()).print(); + System.out.println(((plasmaCrawlLURLEntry) enu.next()).toString()); } } catch (Exception e) { e.printStackTrace(); diff --git a/source/de/anomic/plasma/plasmaCrawlLURLEntry.java b/source/de/anomic/plasma/plasmaCrawlLURLEntry.java index b66c49c1c..18c859a6b 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURLEntry.java +++ b/source/de/anomic/plasma/plasmaCrawlLURLEntry.java @@ -27,10 +27,11 @@ package de.anomic.plasma; import java.io.IOException; +import java.net.MalformedURLException; import java.util.Date; -import de.anomic.net.URL; import de.anomic.kelondro.kelondroRow; +import de.anomic.net.URL; import de.anomic.index.indexEntry; public interface plasmaCrawlLURLEntry { @@ -39,9 +40,7 @@ public interface plasmaCrawlLURLEntry { public String hash(); - public URL url(); - - public String descr(); + public Components comp(); public Date moddate(); @@ -51,12 +50,6 @@ public interface plasmaCrawlLURLEntry { public char doctype(); - public int copyCount(); - - public boolean local(); - - public int quality(); - public String language(); public int size(); @@ -73,6 +66,26 @@ public interface plasmaCrawlLURLEntry { public String toString(); - public void print(); - + public class Components { + private URL url; + private String descr, author, tags, ETag; + + public Components(String url, String descr, String author, String tags, String ETag) { + try { + this.url = new URL(url); + } catch (MalformedURLException e) { + this.url = null; + } + this.descr = descr; + this.author = author; + this.tags = tags; + 
this.ETag = ETag; + } + public URL url() { return this.url; } + public String descr() { return this.descr; } + public String author() { return this.author; } + public String tags() { return this.tags; } + public String ETag() { return this.ETag; } + } + } diff --git a/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java b/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java new file mode 100644 index 000000000..bd00fe8d2 --- /dev/null +++ b/source/de/anomic/plasma/plasmaCrawlLURLNewEntry.java @@ -0,0 +1,337 @@ +package de.anomic.plasma; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.text.ParseException; +import java.util.Date; +import java.util.Properties; +import java.util.ArrayList; + +import de.anomic.index.indexEntry; +import de.anomic.index.indexURL; +import de.anomic.index.indexURLEntry; +import de.anomic.kelondro.kelondroNaturalOrder; +import de.anomic.kelondro.kelondroBase64Order; +import de.anomic.kelondro.kelondroRow; +import de.anomic.net.URL; +import de.anomic.server.serverByteBuffer; +import de.anomic.server.serverCodings; +import de.anomic.tools.crypt; +import de.anomic.tools.bitfield; +import de.anomic.tools.nxTools; + +public class plasmaCrawlLURLNewEntry implements plasmaCrawlLURLEntry { + + public static final kelondroRow rowdef = new kelondroRow( + "String hash-12, " + // the url's hash + "String comp-360, " + // components: the url, description, author and tags. 
As 5th element, an ETag is possible + "Cardinal mod-4 {b256}, " + // last-modified from the httpd + "Cardinal load-4 {b256}, " + // time when the url was loaded + "String referrer-12, " + // (one of) the url's referrer hash(es) + "byte[] md5-8" + // the md5 of the url content (to identify changes) + "Cardinal size-6 {b256}, " + // size of file in bytes + "Cardinal wc-3 {b256}, " + // size of file by number of words; for video and audio: seconds + "byte[] dt-1, " + // doctype, taken from extension or any other heuristic + "byte[] flags-4, " + // flags; any stuff (see Word-Entity definition) + "String lang-2, " + // language + "Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width + "Cardinal lother-2 {b256}, " + // # of outlinks to outside domain; for video and image: height + "Cardinal limage-2 {b256}, " + // # of embedded image links + "Cardinal laudio-2 {b256}, " + // # of embedded audio links; for audio: track number; for video: number of audio tracks + "Cardinal lvideo-2 {b256}, " + // # of embedded video links + "Cardinal lapp-2 {b256}"); // # of embedded links to applications + + private kelondroRow.Entry entry; + private String snippet; + private indexEntry word; // this is only used if the url is transported via remote search requests + + public plasmaCrawlLURLNewEntry( + URL url, + String descr, + String author, + String tags, + String ETag, + Date mod, + Date load, + String referrer, + byte[] md5, + long size, + int wc, + byte dt, + bitfield flags, + String lang, + int llocal, + int lother, + int laudio, + int limage, + int lvideo, + int lapp) { + // create new entry and store it into database + this.entry = rowdef.newEntry(); + this.entry.setCol("hash", indexURL.urlHash(url), null); + this.entry.setCol("comp", encodeComp(url, descr, author, tags, ETag)); + this.entry.setCol("mod", encodeDate(mod)); + this.entry.setCol("load", encodeDate(load)); + this.entry.setCol("referrer", referrer.getBytes()); + 
this.entry.setCol("md5", md5); + this.entry.setCol("size", size); + this.entry.setCol("wc", wc); + this.entry.setCol("dt", dt); + this.entry.setCol("flags", flags.getBytes()); + this.entry.setCol("lang", lang.getBytes()); + this.entry.setCol("llocal", llocal); + this.entry.setCol("lother", lother); + this.entry.setCol("limage", limage); + this.entry.setCol("laudio", laudio); + this.entry.setCol("lvideo", lvideo); + this.entry.setCol("lapp", lapp); + this.snippet = null; + this.word = null; + } + + byte[] encodeDate(Date d) { + return kelondroNaturalOrder.encodeLong(d.getTime() / 86400000, 4); + } + + byte[] encodeComp(URL url, String descr, String author, String tags, String ETag) { + serverByteBuffer s = new serverByteBuffer(200); + s.append(url.toNormalform()).append((char) 10); + s.append(author).append((char) 10); + s.append(tags).append((char) 10); + s.append(ETag).append((char) 10); + return s.getBytes(); + } + + public plasmaCrawlLURLNewEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException { + this.entry = entry; + this.snippet = null; + this.word = searchedWord; + } + + public plasmaCrawlLURLNewEntry(Properties prop, boolean setGlobal) throws IOException { + // generates an plasmaLURLEntry using the properties from the argument + // the property names must correspond to the one from toString + //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); + URL url; + try { + url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null)); + } catch (MalformedURLException e) { + throw new IOException("URL is not proper: " + crypt.simpleDecode(prop.getProperty("url", ""), null)); + } + String descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (descr == null) descr = ""; + String author = crypt.simpleDecode(prop.getProperty("author", ""), null); if (author == null) author = ""; + String tags = crypt.simpleDecode(prop.getProperty("tags", ""), null); if (tags == null) tags = ""; + String ETag = 
crypt.simpleDecode(prop.getProperty("ETag", ""), null); if (ETag == null) ETag = ""; + + this.entry = rowdef.newEntry(); + this.entry.setCol("hash", indexURL.urlHash(url), null); + this.entry.setCol("comp", encodeComp(url, descr, author, tags, ETag)); + try { + this.entry.setCol("mod", encodeDate(indexURL.shortDayFormatter.parse(prop.getProperty("mod", "20000101")))); + } catch (ParseException e) { + this.entry.setCol("mod", encodeDate(new Date())); + } + try { + this.entry.setCol("load", encodeDate(indexURL.shortDayFormatter.parse(prop.getProperty("load", "20000101")))); + } catch (ParseException e) { + this.entry.setCol("load", encodeDate(new Date())); + } + this.entry.setCol("referrer", prop.getProperty("referrer", indexURL.dummyHash).getBytes()); + this.entry.setCol("md5", serverCodings.decodeHex(prop.getProperty("md5", indexURL.dummyHash))); + this.entry.setCol("size", Integer.parseInt(prop.getProperty("size", "0"))); + this.entry.setCol("wc", Integer.parseInt(prop.getProperty("wc", "0"))); + this.entry.setCol("dt", prop.getProperty("dt", "t").charAt(0)); + this.entry.setCol("flags", serverCodings.decodeHex(prop.getProperty("flags", "00000000"))); + this.entry.setCol("lang", prop.getProperty("lang", "uk").getBytes()); + this.entry.setCol("llocal", Integer.parseInt(prop.getProperty("llocal", "0"))); + this.entry.setCol("lother", Integer.parseInt(prop.getProperty("lother", "0"))); + this.entry.setCol("limage", Integer.parseInt(prop.getProperty("limage", "0"))); + this.entry.setCol("laudio", Integer.parseInt(prop.getProperty("laudio", "0"))); + this.entry.setCol("lvideo", Integer.parseInt(prop.getProperty("lvideo", "0"))); + this.entry.setCol("lapp", Integer.parseInt(prop.getProperty("lapp", "0"))); + this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null); + this.word = (prop.containsKey("word")) ? 
new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word", ""))) : null; + } + + private StringBuffer corePropList() { + // generate a parseable string; this is a simple property-list + plasmaCrawlLURLEntry.Components comp = this.comp(); + final StringBuffer s = new StringBuffer(300); + try { + s.append("hash=").append(hash()); + s.append(",url=").append(crypt.simpleEncode(comp.url().toNormalform())); + s.append(",descr=").append(crypt.simpleEncode(comp.descr())); + s.append(",author=").append(crypt.simpleEncode(comp.author())); + s.append(",tags=").append(crypt.simpleEncode(comp.tags())); + s.append(",ETag=").append(crypt.simpleEncode(comp.ETag())); + s.append(",mod=").append(indexURL.shortDayFormatter.format(moddate())); + s.append(",load=").append(indexURL.shortDayFormatter.format(loaddate())); + s.append(",referrer=").append(referrerHash()); + s.append(",md5=").append(md5()); + s.append(",size=").append(size()); + s.append(",wc=").append(wordCount()); + s.append(",dt=").append(doctype()); + s.append(",flags=").append(serverCodings.encodeHex(flags().getBytes())); + s.append(",lang=").append(language()); + s.append(",llocal=").append(llocal()); + s.append(",lother=").append(lother()); + s.append(",limage=").append(limage()); + s.append(",laudio=").append(laudio()); + s.append(",lvideo=").append(lvideo()); + s.append(",lapp=").append(lapp()); + + if (this.word != null) { + // append also word properties + s.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false))); + } + return s; + + } catch (Exception e) { + // serverLog.logFailure("plasmaLURL.corePropList", e.getMessage()); + // if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null"); + // if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null"); + // e.printStackTrace(); + return null; + } + } + + public kelondroRow.Entry toRowEntry() throws IOException { + return this.entry; + } 
+ + public String hash() { + // return a url-hash, based on the md5 algorithm + // the result is a String of 12 bytes within a 72-bit space + // (each byte has an 6-bit range) + // that should be enough for all web pages on the world + return this.entry.getColString("hash", "", null); + } + + public de.anomic.plasma.plasmaCrawlLURLEntry.Components comp() { + ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8"); + return new de.anomic.plasma.plasmaCrawlLURLEntry.Components( + (cl.size() > 0) ? (String) cl.get(0) : "", + (cl.size() > 1) ? (String) cl.get(1) : "", + (cl.size() > 2) ? (String) cl.get(2) : "", + (cl.size() > 3) ? (String) cl.get(3) : "", + (cl.size() > 4) ? (String) cl.get(4) : ""); + } + + public Date moddate() { + return new Date(86400000 * entry.getColLong("mod", 0)); + } + + public Date loaddate() { + return new Date(86400000 * entry.getColLong("load", 0)); + } + + public String referrerHash() { + // return the creator's hash + return entry.getColString("referrer", indexURL.dummyHash, null); + } + + public String md5() { + // returns the md5 in hex representation + return serverCodings.encodeHex(entry.getCol("md5", indexURL.dummyHash.getBytes())); + } + + public char doctype() { + return (char) entry.getColByte("dt", (byte) 't'); + } + + public String language() { + return this.entry.getColString("lang", "uk", null); + } + + public int size() { + return (int) this.entry.getColLong("size", 0); + } + + public bitfield flags() { + return new bitfield(this.entry.getCol("flags", new byte[4])); + } + + public int wordCount() { + return (int) this.entry.getColLong("wc", 0); + } + + public int llocal() { + return (int) this.entry.getColLong("llocal", 0); + } + + public int lother() { + return (int) this.entry.getColLong("lother", 0); + } + + public int limage() { + return (int) this.entry.getColLong("limage", 0); + } + + public int laudio() { + return (int) this.entry.getColLong("laudio", 0); + } + + public int lvideo() { + return (int) 
this.entry.getColLong("lvideo", 0); + } + + public int lapp() { + return (int) this.entry.getColLong("lapp", 0); + } + + public String snippet() { + // the snippet may appear here if the url was transported in a remote search + // it will not be saved anywhere, but can only be requested here + return snippet; + } + + public indexEntry word() { + return word; + } + + public boolean isOlder(plasmaCrawlLURLEntry other) { + if (other == null) return false; + Date tmoddate = moddate(); + Date omoddate = other.moddate(); + if (tmoddate.before(omoddate)) return true; + if (tmoddate.equals(omoddate)) { + Date tloaddate = loaddate(); + Date oloaddate = other.loaddate(); + if (tloaddate.before(oloaddate)) return true; + if (tloaddate.equals(oloaddate)) return true; + } + return false; + } + + public String toString(String snippet) { + // add information needed for remote transport + final StringBuffer core = corePropList(); + if (core == null) + return null; + + core.ensureCapacity(core.length() + snippet.length() * 2); + core.insert(0, "{"); + core.append(",snippet=").append(crypt.simpleEncode(snippet)); + core.append("}"); + + return core.toString(); + //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}"; + } + + /** + * Returns this object as String.
+ * This e.g. looks like this: + *
{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}
+ */ + public String toString() { + final StringBuffer core = corePropList(); + if (core == null) return null; + + core.insert(0, "{"); + core.append("}"); + + return core.toString(); + //return "{" + core + "}"; + } + +} diff --git a/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java b/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java index b6c9aa09f..570711e98 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java +++ b/source/de/anomic/plasma/plasmaCrawlLURLOldEntry.java @@ -36,7 +36,6 @@ import de.anomic.index.indexURL; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroRow; -import de.anomic.net.URL; import de.anomic.server.logging.serverLog; import de.anomic.tools.crypt; @@ -57,7 +56,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { "Cardinal size-" + indexURL.urlSizeLength + " {b64e}, " + // size of file in bytes "Cardinal wc-" + indexURL.urlWordCountLength + " {b64e}"); // word count - private URL url; + private String url; private String descr; private Date moddate; private Date loaddate; @@ -73,19 +72,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { private String snippet; private indexEntry word; // this is only used if the url is transported via remote search requests - // more needed attributes: - // - author / copyright owner - // - keywords - // - phrasecount, total number of phrases - // - boolean: URL attributes (see Word-Entity definition) - // - boolean: appearance of bold and/or italics - // - ETag: for re-crawl decision upon HEAD request - // - int: # of outlinks to same domain - // - int: # of outlinks to outside domain - // - int: # of keywords - // - int: # der auf der Seite vorhandenen Links zu image, audio, video, applications - - public plasmaCrawlLURLOldEntry(URL url, String descr, Date moddate, + public plasmaCrawlLURLOldEntry(String url, String descr, Date moddate, Date loaddate, String referrerHash, 
int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) { @@ -110,7 +97,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { public plasmaCrawlLURLOldEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException { try { this.urlHash = entry.getColString(0, null); - this.url = new URL(entry.getColString(1, "UTF-8").trim()); + this.url = entry.getColString(1, "UTF-8").trim(); this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim(); this.moddate = new Date(86400000 * entry.getColLong(3)); this.loaddate = new Date(86400000 * entry.getColLong(4)); @@ -144,7 +131,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { this.copyCount = Integer.parseInt(prop.getProperty("cc", "0")); this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " "); if (setGlobal) this.flags = "G "; - this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null)); + this.url = crypt.simpleDecode(prop.getProperty("url", ""), null); this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (this.descr == null) this.descr = this.url.toString(); this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", "")); @@ -195,13 +182,9 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { // that should be enough for all web pages on the world return this.urlHash; } - - public URL url() { - return url; - } - - public String descr() { - return descr; + + public Components comp() { + return new Components(url, descr, "", "", ""); } public Date moddate() { @@ -263,9 +246,7 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { if (moddate.before(other.moddate())) return true; if (moddate.equals(other.moddate())) { if (loaddate.before(other.loaddate())) return true; - if (loaddate.equals(other.loaddate())) { - if (quality < other.quality()) return true; - } + if 
(loaddate.equals(other.loaddate())) return true; } return false; } @@ -297,30 +278,10 @@ public class plasmaCrawlLURLOldEntry implements plasmaCrawlLURLEntry { return corePropStr; } catch (Exception e) { - // serverLog.logFailure("plasmaLURL.corePropList", e.getMessage()); - // if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null"); - // if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null"); - // e.printStackTrace(); return null; } } - /* - public String toString(int posintext, int posinphrase, int posofphrase) { - // add information needed for remote transport - final StringBuffer core = corePropList(); - if (core == null) return null; - - core.ensureCapacity(core.length() + 200); - core.insert(0,"{") - .append(",posintext=").append(posintext) - .append(",posinphrase=").append(posinphrase) - .append(",posofphraseint=").append(posofphrase) - .append("}"); - return core.toString(); - } - */ - public String toString(String snippet) { // add information needed for remote transport final StringBuffer core = corePropList(); diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index c827ee6af..a7eea14e0 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -227,7 +227,7 @@ public class plasmaDHTChunk { while ((urlIter.hasNext()) && (maxcount > refcount) && (System.currentTimeMillis() < timeout)) { iEntry = (indexEntry) urlIter.next(); lurl = lurls.load(iEntry.urlHash(), iEntry); - if ((lurl == null) || (lurl.url() == null)) { + if ((lurl == null) || (lurl.comp().url() == null)) { //yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash()); notBoundCounter++; urlIter.remove(); diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java index d849b394d..78a834304 
100644 --- a/source/de/anomic/plasma/plasmaSearchImages.java +++ b/source/de/anomic/plasma/plasmaSearchImages.java @@ -104,7 +104,7 @@ public final class plasmaSearchImages { plasmaCrawlLURLEntry urlentry; while (sres.hasMoreElements()) { urlentry = sres.nextElement(); - addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.url(), depth)); + addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.comp().url(), depth)); } } diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 355f60839..b628ed45b 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -197,8 +197,9 @@ public class plasmaSearchRankingProfile { long ranking = preranking; // prefer hit with 'prefer' pattern - if (page.url().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); - if (page.descr().toString().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); + plasmaCrawlLURLEntry.Components comp = page.comp(); + if (comp.url().toNormalform().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); + if (comp.descr().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue(); // apply 'common-sense' heuristic using references for (int j = 0; j < urlcomps.length; j++) { @@ -220,11 +221,11 @@ public class plasmaSearchRankingProfile { } // prefer short urls - ranking += (256 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue(); + ranking += (256 - comp.url().toNormalform().length()) << ((Integer) coeff.get(URLLENGTH)).intValue(); ranking += (8 * Math.max(0, 32 - urlcomps.length)) << ((Integer) coeff.get(URLCOMPS)).intValue(); // prefer long descriptions - ranking += (256 * page.descr().length() / 80) << ((Integer) 
coeff.get(DESCRLENGTH)).intValue(); + ranking += (256 * comp.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue(); /* DESCRLENGTH weights the description length, as the replaced line did; the URL length is already scored under URLLENGTH */ ranking += (256 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue(); return ranking; diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index 0a2234ce3..0878c2350 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -108,11 +108,10 @@ public final class plasmaSearchResult { protected void addResult(plasmaCrawlLURLEntry page, Long preranking) { // take out relevant information for reference computation - URL url = page.url(); - String descr = page.descr(); - if ((url == null) || (descr == null)) return; - String[] urlcomps = htmlFilterContentScraper.urlComps(url.toString()); // word components of the url - String[] descrcomps = descr.toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description + plasmaCrawlLURLEntry.Components comp = page.comp(); + if ((comp.url() == null) || (comp.descr() == null)) return; + String[] urlcomps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()); // word components of the url + String[] descrcomps = comp.descr().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description // store everything results.add(new Object[] {page, urlcomps, descrcomps, preranking}); @@ -168,12 +167,12 @@ public final class plasmaSearchResult { Iterator i = pageAcc.entrySet().iterator(); HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation Map.Entry entry; - String path; + String path = null; // first scan all entries and find all urls that are referenced while (i.hasNext()) { entry = (Map.Entry) i.next(); - path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url()); + path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url()); paths.put(path,
entry.getKey()); //if (path != null) path = shortenPath(path); //if (path != null) paths.put(path, entry.getKey()); @@ -184,7 +183,7 @@ public final class plasmaSearchResult { String shorten; while (i.hasNext()) { entry = (Map.Entry) i.next(); - path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url()); + path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).comp().url()); shorten = shortenPath(path); // scan all subpaths of the url while (shorten != null) { diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index e6e6516aa..503570692 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -289,6 +289,7 @@ public class plasmaSnippetCache { * @return the parsed document as {@link plasmaParserDocument} */ public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) { + if (url == null) return null; IResourceInfo docInfo = null; try { // trying to load the resource body from cache @@ -634,11 +635,12 @@ public class plasmaSnippetCache { long limitTime = (maxTime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + maxTime; while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) { urlentry = acc.nextElement(); - if (urlentry.url().getHost().endsWith(".yacyh")) continue; - urlstring = urlentry.url().toNormalform(); + plasmaCrawlLURLEntry.Components comp = urlentry.comp(); + if (comp.url().getHost().endsWith(".yacyh")) continue; + urlstring = comp.url().toNormalform(); if ((urlstring.matches(urlmask)) && - (!(existsInCache(urlentry.url(), queryhashes)))) { - new Fetcher(urlentry.url(), queryhashes, (int) maxTime).start(); + (!(existsInCache(comp.url(), queryhashes)))) { + new Fetcher(comp.url(), queryhashes, (int) maxTime).start(); i++; } } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 62fc2ea03..16ac55d03 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1559,7 +1559,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // create a new loaded URL db entry plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry( - entry.url(), // URL + entry.url().toNormalform(), // URL docDescription, // document description docDate, // modification date new Date(), // loaded date @@ -1641,8 +1641,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String language = indexEntryAttribute.language(entry.url()); char doctype = indexEntryAttribute.docType(document.getMimeType()); - int urlLength = newEntry.url().toString().length(); - int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length; + plasmaCrawlLURLEntry.Components comp = newEntry.comp(); + int urlLength = comp.url().toNormalform().length(); + int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length; // iterate over all words Iterator i = condenser.words(); @@ -2046,10 +2047,9 @@ public final class 
plasmaSwitchboard extends serverAbstractSwitch implements ser prop.put("type_globalresults", acc.globalContributions); int i = 0; int p; - URL url; plasmaCrawlLURLEntry urlentry; String urlstring, urlname, filename, urlhash; - String host, hash, address, descr = ""; + String host, hash, address; yacySeed seed; plasmaSnippetCache.Snippet snippet; boolean includeSnippets = false; @@ -2058,30 +2058,29 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000; while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) { urlentry = acc.nextElement(); - url = urlentry.url(); + plasmaCrawlLURLEntry.Components comp = urlentry.comp(); urlhash = urlentry.hash(); - host = url.getHost(); + host = comp.url().getHost(); if (host.endsWith(".yacyh")) { // translate host into current IP p = host.indexOf("."); hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6)); seed = yacyCore.seedDB.getConnected(hash); - filename = url.getFile(); + filename = comp.url().getFile(); if ((seed == null) || ((address = seed.getAddress()) == null)) { // seed is not known from here - removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + urlentry.descr()).getBytes())); + removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes())); urlPool.loadedURL.remove(urlentry.hash()); // clean up continue; // next result } - url = new URL("http://" + address + "/" + host.substring(0, p) + filename); urlname = "http://share." 
+ seed.getName() + ".yacy" + filename; if ((p = urlname.indexOf("?")) > 0) urlname = urlname.substring(0, p); - urlstring = url.toNormalform(); + urlstring = "http://" + address + "/" + host.substring(0, p) + filename; } else { - urlstring = url.toNormalform(); + urlstring = comp.url().toNormalform(); urlname = urlstring; } - descr = urlentry.descr(); + // check bluelist again: filter out all links where any bluelisted word // appear either in url, url's description or search word @@ -2097,7 +2096,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser URL wordURL; if (urlstring.matches(query.urlMask)) { //.* is default if (includeSnippets) { - snippet = snippetCache.retrieveSnippet(url, query.queryHashes, false, 260, 1000); + snippet = snippetCache.retrieveSnippet(comp.url(), query.queryHashes, false, 260, 1000); } else { snippet = null; } @@ -2107,7 +2106,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser prop.put("type_results_" + i + "_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", urlstring) == null) ? 
1 : 0); prop.put("type_results_" + i + "_recommend_deletelink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*"); prop.put("type_results_" + i + "_recommend_recommendlink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*"); - prop.put("type_results_" + i + "_description", descr); + prop.put("type_results_" + i + "_description", comp.descr()); prop.put("type_results_" + i + "_url", urlstring); prop.put("type_results_" + i + "_urlhash", urlhash); prop.put("type_results_" + i + "_urlhexhash", yacySeed.b64Hash2hexHash(urlhash)); @@ -2196,19 +2195,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // determine the url string plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null); if (entry == null) return 0; - - URL url = entry.url(); - if (url == null) return 0; + plasmaCrawlLURLEntry.Components comp = entry.comp(); + if (comp.url() == null) return 0; InputStream resourceContent = null; try { // get the resource content - Object[] resource = snippetCache.getResource(url, fetchOnline, 10000); + Object[] resource = snippetCache.getResource(comp.url(), fetchOnline, 10000); resourceContent = (InputStream) resource[0]; Long resourceContentLength = (Long) resource[1]; // parse the resource - plasmaParserDocument document = snippetCache.parseDocument(url, resourceContentLength.longValue(), resourceContent); + plasmaParserDocument document = snippetCache.parseDocument(comp.url(), resourceContentLength.longValue(), resourceContent); // getting parsed body input stream InputStream docBodyInputStream = document.getText(); diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java 
index cfe7b1391..584d1ff53 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -334,7 +334,7 @@ public class plasmaSwitchboardQueue { if (referrerURL == null) { if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null; plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null); - if (entry == null) referrerURL = null; else referrerURL = entry.url(); + if (entry == null) referrerURL = null; else referrerURL = entry.comp().url(); } return referrerURL; } diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java index d1d4e0940..e02af682f 100644 --- a/source/de/anomic/plasma/plasmaURLPool.java +++ b/source/de/anomic/plasma/plasmaURLPool.java @@ -84,7 +84,7 @@ public class plasmaURLPool { if (ne != null) return ne.url(); } catch (IOException e) {} plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null); - if (le != null) return le.url(); + if (le != null) return le.comp().url(); plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); if (ee != null) return ee.url(); return null; diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 3690804ea..63bc44184 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -780,7 +780,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { if (ue == null) { urlHashs.add(entry.urlHash()); } else { - url = ue.url(); + url = ue.comp().url(); if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) { urlHashs.add(entry.urlHash()); } diff --git a/source/de/anomic/tools/bitfield.java b/source/de/anomic/tools/bitfield.java index 95e3534ca..b417145ab 100644 --- a/source/de/anomic/tools/bitfield.java +++ b/source/de/anomic/tools/bitfield.java @@ -46,7 +46,7 @@ public class bitfield { public bitfield(int 
bytelength) { this.bb= new byte[bytelength]; - for (int i = 0 ; i < bytelength; i++) bb[i] = (char) 48; + for (int i = 0 ; i < bytelength; i++) bb[i] = 0; } public bitfield(byte[] field) { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 83d5a8190..5d400530b 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -502,7 +502,9 @@ public final class yacyClient { for (int n = 0; n < results; n++) { // get one single search result urlEntry = urlManager.newEntry((String) result.get("resource" + n), true); - if ((urlEntry == null) || (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, urlEntry.url()))) { continue; } // block with backlist + if (urlEntry == null) continue; + plasmaCrawlLURLEntry.Components comp = urlEntry.comp(); + if (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, comp.url())) continue; // block with backlist urlManager.store(urlEntry); urlManager.stack(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); @@ -510,19 +512,20 @@ public final class yacyClient { final indexEntry entry; if (urlEntry.word() == null) { // the old way to define words - int urlLength = urlEntry.url().toString().length(); - int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length; + int urlLength = comp.url().toNormalform().length(); + int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform()).length; entry = new indexURLEntry( urlEntry.hash(), - urlLength, urlComps, - urlEntry.descr().length(), + urlLength, + urlComps, + comp.descr().length(), urlEntry.wordCount(), 0, 0, 0, 0, 0, 0, urlEntry.size(), urlEntry.moddate().getTime(), System.currentTimeMillis(), - urlEntry.quality(), + 0, urlEntry.language(), urlEntry.doctype(), 0,0, diff --git a/source/yacy.java b/source/yacy.java index 3acdea737..01a4d055e 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -958,7 +958,8 @@ public final class yacy { while (eiter.hasNext()) { try { 
entry = (plasmaCrawlLURLEntry) eiter.next(); - if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), null); + plasmaCrawlLURLEntry.Components comp = (entry == null) ? null : entry.comp(); /* test entry for null before dereferencing it */ + if ((comp != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null); } catch (Exception e) { // here a MalformedURLException may occur // just ignore @@ -1068,12 +1069,13 @@ public final class yacy { plasmaCrawlLURLEntry entry; while (eiter.hasNext()) { entry = (plasmaCrawlLURLEntry) eiter.next(); - if ((entry != null) && (entry.url() != null)) { + plasmaCrawlLURLEntry.Components comp = (entry == null) ? null : entry.comp(); /* test entry for null before dereferencing it */ + if ((comp != null) && (comp.url() != null)) { if (html) { - bos.write(("" + entry.descr() + "
").getBytes("UTF-8")); + bos.write(("" + comp.descr() + "
").getBytes("UTF-8")); bos.write(serverCore.crlf); } else { - bos.write(entry.url().toString().getBytes()); + bos.write(comp.url().toNormalform().getBytes()); bos.write(serverCore.crlf); } } @@ -1128,7 +1130,8 @@ public final class yacy { plasmaCrawlLURLEntry entry; while (eiter.hasNext()) { entry = (plasmaCrawlLURLEntry) eiter.next(); - if ((entry != null) && (entry.url() != null)) { + plasmaCrawlLURLEntry.Components comp = (entry == null) ? null : entry.comp(); /* test entry for null before dereferencing it */ + if ((comp != null) && (comp.url() != null)) { fsp.put(entry.toRowEntry(), entry.loaddate()); } }