- refactoring of plasmaCrawlLURL.Entry to prepare new Entry format

- added test migration method to migrate the old LURL to a new LURL
the new LURL will be split into different tables, one for each month
this solves several problems:
- the biggest table in YaCy is split into different parts and can
  also be managed in filesystems that are limited to 2GB
- the oldest entries can easily be identified, used for re-crawl and
  deleted
- The complete database can be limited to a specific size (as wanted many times)


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2755 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 130cc76927
commit a5dd0d41af

@ -56,7 +56,7 @@ import de.anomic.data.listManager;
import de.anomic.data.bookmarksDB.Tag; import de.anomic.data.bookmarksDB.Tag;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -147,7 +147,7 @@ public class Bookmarks {
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash); bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) { if (bookmark == null) {
// try to get the bookmark from the LURL database // try to get the bookmark from the LURL database
plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaParserDocument document = null; plasmaParserDocument document = null;
if(urlentry != null){ if(urlentry != null){
document = switchboard.snippetCache.retrieveDocument(urlentry.url(), true); document = switchboard.snippetCache.retrieveDocument(urlentry.url(), true);

@ -61,7 +61,7 @@ import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
@ -218,7 +218,7 @@ public class IndexControl_p {
} }
if (post.containsKey("urlhashdelete")) { if (post.containsKey("urlhashdelete")) {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) { if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else { } else {
@ -265,7 +265,7 @@ public class IndexControl_p {
HashMap knownURLs = new HashMap(); HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet(); HashSet unknownURLEntries = new HashSet();
indexEntry iEntry; indexEntry iEntry;
plasmaCrawlLURL.Entry lurl; plasmaCrawlLURLEntry lurl;
while (urlIter.hasNext()) { while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next(); iEntry = (indexEntry) urlIter.next();
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null); lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
@ -321,7 +321,7 @@ public class IndexControl_p {
URL url = new URL(urlstring); URL url = new URL(urlstring);
urlhash = indexURL.urlHash(url); urlhash = indexURL.urlHash(url);
prop.put("urlhash", urlhash); prop.put("urlhash", urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) { if (entry == null) {
prop.put("urlstring", "unknown url: " + urlstring); prop.put("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", ""); prop.put("urlhash", "");
@ -335,7 +335,7 @@ public class IndexControl_p {
} }
if (post.containsKey("urlhashsearch")) { if (post.containsKey("urlhashsearch")) {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) { if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash); prop.put("result", "No Entry for URL hash " + urlhash);
} else { } else {
@ -351,12 +351,12 @@ public class IndexControl_p {
try { try {
final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash); final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash);
StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:<br>"); StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:<br>");
plasmaCrawlLURL.Entry entry; plasmaCrawlLURLEntry entry;
int i = 0; int i = 0;
int rows = 0, cols = 0; int rows = 0, cols = 0;
prop.put("urlhashsimilar", 1); prop.put("urlhashsimilar", 1);
while (entryIt.hasNext() && i < 256) { while (entryIt.hasNext() && i < 256) {
entry = (plasmaCrawlLURL.Entry) entryIt.next(); entry = (plasmaCrawlLURLEntry) entryIt.next();
prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", entry.hash()); prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", entry.hash());
cols++; cols++;
if (cols==8) { if (cols==8) {
@ -403,7 +403,7 @@ public class IndexControl_p {
return prop; return prop;
} }
public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.Entry entry, String urlhash) { public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURLEntry entry, String urlhash) {
serverObjects prop = new serverObjects(); serverObjects prop = new serverObjects();
if (entry == null) { if (entry == null) {
prop.put("genUrlProfile", 1); prop.put("genUrlProfile", 1);
@ -412,7 +412,7 @@ public class IndexControl_p {
} }
URL url = entry.url(); URL url = entry.url();
String referrer = null; String referrer = null;
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null); plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (le == null) { if (le == null) {
referrer = "<unknown>"; referrer = "<unknown>";
} else { } else {
@ -463,7 +463,7 @@ public class IndexControl_p {
while (en.hasNext()) { while (en.hasNext()) {
xi = (indexEntry) en.next(); xi = (indexEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())}; uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(uh[0], null); plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null);
if (le == null) { if (le == null) {
tm.put(uh[0], uh); tm.put(uh[0], uh);
} else { } else {

@ -55,13 +55,13 @@ import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo; import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.plasmaCrawlerException; import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.plasmaCrawlLURL.Entry;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -107,7 +107,7 @@ public class ViewFile {
String viewMode = post.get("viewMode","sentences"); String viewMode = post.get("viewMode","sentences");
// getting the urlEntry that belongs to the url hash // getting the urlEntry that belongs to the url hash
Entry urlEntry = null; plasmaCrawlLURLEntry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null); urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
if (urlEntry == null) { if (urlEntry == null) {
prop.put("error",2); prop.put("error",2);

@ -64,7 +64,7 @@ import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
@ -358,7 +358,7 @@ public class dir {
try { try {
final URL url = new URL(urlstring); final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes())); final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.newEntry( final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
url, "YaCyShare: " + descr, new Date(), new Date(), url, "YaCyShare: " + descr, new Date(), new Date(),
"AAAAAAAAAAAA", /*referrer*/ "AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/ 0, /*copycount*/

@ -51,7 +51,7 @@ import java.util.Date;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -249,7 +249,7 @@ public final class crawlOrder {
// case where we have already the url loaded; // case where we have already the url loaded;
reason = reasonString; reason = reasonString;
// send lurl-Entry as response // send lurl-Entry as response
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null); plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
if (entry == null) { if (entry == null) {
response = "rejected"; response = "rejected";
lurl = ""; lurl = "";

@ -51,7 +51,7 @@ import java.io.IOException;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -124,7 +124,7 @@ public final class crawlReceipt {
prop.put("delay", "3600"); prop.put("delay", "3600");
} else if (result.equals("fill")) { } else if (result.equals("fill")) {
// generating a new loaded URL entry // generating a new loaded URL entry
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true); plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true);
if ((entry == null)||(entry.url()==null)) { if ((entry == null)||(entry.url()==null)) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam + log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr); "\n\tURL properties: "+ propStr);

@ -54,7 +54,7 @@ import java.util.Set;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer; import de.anomic.index.indexContainer;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchRankingProfile;
@ -244,10 +244,10 @@ public final class search {
StringBuffer links = new StringBuffer(); StringBuffer links = new StringBuffer();
String resource = ""; String resource = "";
//plasmaIndexEntry pie; //plasmaIndexEntry pie;
plasmaCrawlLURL.Entry urlentry; plasmaCrawlLURLEntry urlentry;
plasmaSnippetCache.Snippet snippet; plasmaSnippetCache.Snippet snippet;
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) { while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = acc.nextElement(); urlentry = (plasmaCrawlLURLEntry) acc.nextElement();
if (includesnippet) { if (includesnippet) {
snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260, 1000); snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260, 1000);
} else { } else {

@ -48,7 +48,7 @@
import java.io.IOException; import java.io.IOException;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
@ -90,7 +90,7 @@ public final class transferURL {
final int sizeBefore = sb.urlPool.loadedURL.size(); final int sizeBefore = sb.urlPool.loadedURL.size();
// read the urls from the other properties and store // read the urls from the other properties and store
String urls; String urls;
plasmaCrawlLURL.Entry lEntry; plasmaCrawlLURLEntry lEntry;
for (int i = 0; i < urlc; i++) { for (int i = 0; i < urlc; i++) {
serverCore.checkInterruption(); serverCore.checkInterruption();
urls = (String) post.get("url" + i); urls = (String) post.get("url" + i);

@ -57,7 +57,7 @@ import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchImages; import de.anomic.plasma.plasmaSearchImages;
import de.anomic.plasma.plasmaSearchPreOrder; import de.anomic.plasma.plasmaSearchPreOrder;
@ -189,7 +189,7 @@ public class yacysearch {
return prop; return prop;
} }
final String recommendHash = post.get("recommendref", ""); // urlhash final String recommendHash = post.get("recommendref", ""); // urlhash
plasmaCrawlLURL.Entry urlentry = sb.urlPool.loadedURL.load(recommendHash, null); plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
if (urlentry != null) { if (urlentry != null) {
plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true); plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true);
if (document != null) { if (document != null) {

@ -13,6 +13,7 @@ import java.util.Iterator;
import java.util.Random; import java.util.Random;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroFlexTable; import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex; import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
@ -186,6 +187,10 @@ public class dbtest {
File tablepath = new File(tablename).getParentFile(); File tablepath = new File(tablename).getParentFile();
table = new kelondroFlexTable(tablepath, new File(tablename).getName(), buffer, preload, testRow, kelondroBase64Order.enhancedCoder); table = new kelondroFlexTable(tablepath, new File(tablename).getName(), buffer, preload, testRow, kelondroBase64Order.enhancedCoder);
} }
if (dbe.equals("kelondroFlexSplitTable")) {
File tablepath = new File(tablename).getParentFile();
table = new kelondroFlexSplitTable(tablepath, new File(tablename).getName(), buffer, preload, testRow, kelondroBase64Order.enhancedCoder);
}
if (dbe.equals("mysql")) { if (dbe.equals("mysql")) {
table = new dbTable("mysql", testRow); table = new dbTable("mysql", testRow);
} }
@ -513,6 +518,10 @@ final class dbTable implements kelondroIndex {
} }
} }
// Date-aware put() added to satisfy the extended kelondroIndex interface
// (which now declares put(Entry, Date)); this SQL-backed table keeps no
// per-entry date, so entryDate is intentionally ignored.
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row); // delegate to the date-less variant; entryDate discarded
}
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException { public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
try { try {

@ -47,7 +47,7 @@ import de.anomic.yacy.yacySeedDB;
public class indexURL { public class indexURL {
// day formatter for entry export // day formatter for entry export
protected static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd"); public static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");
// statics for value lengths // statics for value lengths
public static final int urlHashLength = yacySeedDB.commonHashLength; // 12 public static final int urlHashLength = yacySeedDB.commonHashLength; // 12
@ -428,15 +428,6 @@ public class indexURL {
} }
} }
// Stores a URL row either in the in-memory cache or directly in the
// on-disk index file.
// entry  - the row to store
// cached - when true and the RAM cache is initialized, the row goes into
//          urlIndexCache (synchronized, as the cache is shared state);
//          otherwise it is written straight to urlIndexFile
// NOTE(review): this method is being removed by this commit.
public void store(kelondroRow.Entry entry, boolean cached) throws IOException {
if ((cached) && (urlIndexCache != null))
synchronized (urlIndexCache) {
urlIndexCache.put(entry);
}
else
urlIndexFile.put(entry);
}
public void flushCacheSome() { public void flushCacheSome() {
if (urlIndexCache == null) return; if (urlIndexCache == null) return;
if (urlIndexCache.size() == 0) return; if (urlIndexCache.size() == 0) return;

@ -446,7 +446,7 @@ public class kelondroCollectionIndex {
indexEntry.setCol(idx_col_lastread, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); indexEntry.setCol(idx_col_lastread, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
indexEntry.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); indexEntry.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
index.put(indexEntry); index.put(indexEntry);
throw new kelondroException(arrayFile(this.path, this.filenameStub, this.loadfactor, chunksize, partitionnumber, serialnumber).toString(), "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed"); throw new kelondroException(array.filename, "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed");
} }
int chunkcountInArray = collection.size(); int chunkcountInArray = collection.size();
if (chunkcountInArray != chunkcount) { if (chunkcountInArray != chunkcount) {

@ -209,6 +209,11 @@ public class kelondroColumn {
public String toString() { public String toString() {
StringBuffer s = new StringBuffer(); StringBuffer s = new StringBuffer();
switch (celltype) { switch (celltype) {
case celltype_undefined:
s.append(nickname);
s.append('-');
s.append(cellwidth);
break;
case celltype_boolean: case celltype_boolean:
s.append("boolean "); s.append("boolean ");
s.append(nickname); s.append(nickname);

@ -27,6 +27,7 @@ package de.anomic.kelondro;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondroIndex { public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondroIndex {
@ -137,6 +138,10 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr
return super.get(i); return super.get(i);
} }
// Implements the new kelondroIndex.put(Entry, Date) contract; the flex
// table does not persist entry dates, so the Date argument is ignored
// and the call simply delegates to the single-argument put().
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row); // entryDate unused: no date column in this table
}
public synchronized kelondroRow.Entry put(kelondroRow.Entry row) throws IOException { public synchronized kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
int i = index.geti(row.getColBytes(0)); int i = index.geti(row.getColBytes(0));
if (i < 0) { if (i < 0) {

@ -51,6 +51,7 @@
package de.anomic.kelondro; package de.anomic.kelondro;
import java.io.IOException; import java.io.IOException;
import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
public interface kelondroIndex { public interface kelondroIndex {
@ -60,6 +61,7 @@ public interface kelondroIndex {
public kelondroRow row() throws IOException; public kelondroRow row() throws IOException;
public kelondroRow.Entry get(byte[] key) throws IOException; public kelondroRow.Entry get(byte[] key) throws IOException;
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException; public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException;
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException;
public kelondroRow.Entry remove(byte[] key) throws IOException; public kelondroRow.Entry remove(byte[] key) throws IOException;
public Iterator rows(boolean up, boolean rotating, byte[] firstKey) throws IOException; public Iterator rows(boolean up, boolean rotating, byte[] firstKey) throws IOException;
public void close() throws IOException; public void close() throws IOException;

@ -26,6 +26,8 @@
package de.anomic.kelondro; package de.anomic.kelondro;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.TreeMap; import java.util.TreeMap;
@ -59,6 +61,10 @@ public class kelondroRAMIndex implements kelondroIndex {
return (kelondroRow.Entry) index.get(key); return (kelondroRow.Entry) index.get(key);
} }
// Date-aware put() required by the extended kelondroIndex interface.
// The RAM index stores rows only by key and keeps no timestamps, so the
// entryDate parameter is deliberately discarded.
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row); // forwards to the synchronized date-less put()
}
public synchronized Entry put(Entry row) { public synchronized Entry put(Entry row) {
return (kelondroRow.Entry) index.put(row.getColBytes(0), row); return (kelondroRow.Entry) index.put(row.getColBytes(0), row);
} }

@ -976,7 +976,7 @@ public class kelondroRecords {
return USAGE.FREEC; return USAGE.FREEC;
} }
private final void dispose(Handle h) throws IOException { private synchronized final void dispose(Handle h) throws IOException {
// delete element with handle h // delete element with handle h
// this element is then connected to the deleted-chain and can be // this element is then connected to the deleted-chain and can be
// re-used change counter // re-used change counter
@ -1052,7 +1052,7 @@ public class kelondroRecords {
if (markedDeleted.contains(h)) { if (markedDeleted.contains(h)) {
// loop detection // loop detection
this.theLogger.severe("KELONDRO WARNING " + this.filename + ": FREE-Queue contains loops"); this.theLogger.severe("KELONDRO WARNING " + this.filename + ": FREE-Queue contains loops");
return markedDeleted; return markedDeleted; // TODO: automatic fix
} }
markedDeleted.add(h); markedDeleted.add(h);
seekp = seekpos(h); seekp = seekpos(h);

@ -25,6 +25,7 @@
package de.anomic.kelondro; package de.anomic.kelondro;
import java.io.IOException; import java.io.IOException;
import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.Random; import java.util.Random;
import java.util.TreeSet; import java.util.TreeSet;
@ -76,6 +77,10 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
return entry; return entry;
} }
// Satisfies kelondroIndex.put(Entry, Date); a row set has no notion of
// entry dates, so the Date is ignored and the plain put() is used.
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row); // entryDate intentionally unused
}
public kelondroRow.Entry put(kelondroRow.Entry entry) { public kelondroRow.Entry put(kelondroRow.Entry entry) {
long handle = profile.startWrite(); long handle = profile.startWrite();
int index = -1; int index = -1;

@ -47,6 +47,7 @@ package de.anomic.kelondro;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
public class kelondroSplittedTree implements kelondroIndex { public class kelondroSplittedTree implements kelondroIndex {
@ -109,6 +110,10 @@ public class kelondroSplittedTree implements kelondroIndex {
return ktfs[partition(key)].get(key); return ktfs[partition(key)].get(key);
} }
// Date-aware put() from the extended kelondroIndex interface. The split
// tree routes rows purely by key partition and stores no dates, so the
// entryDate argument is dropped and the call delegates to put(row).
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row); // partition-routed write; date not recorded
}
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException { public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
return ktfs[partition(row.getColBytes(0))].put(row); return ktfs[partition(row.getColBytes(0))].put(row);
} }

@ -50,6 +50,7 @@ import java.io.File;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
import java.util.Date;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
@ -404,8 +405,12 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
return (lc.equals(childn.handle())); return (lc.equals(childn.handle()));
} }
// Associates the specified value with the specified key in this map public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}
public kelondroRow.Entry put(kelondroRow.Entry newrow) throws IOException { public kelondroRow.Entry put(kelondroRow.Entry newrow) throws IOException {
// Associates the specified value with the specified key in this map
kelondroRow.Entry result = null; kelondroRow.Entry result = null;
//writeLock.stay(2000, 1000); //writeLock.stay(2000, 1000);
if (newrow.columns() != row().columns()) throw new IllegalArgumentException("put: wrong row length " + newrow.columns() + "; must be " + row().columns()); if (newrow.columns() != row().columns()) throw new IllegalArgumentException("put: wrong row length " + newrow.columns() + "; must be " + row().columns());

@ -9,6 +9,7 @@ import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry; import de.anomic.index.indexEntry;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDate; import de.anomic.server.serverDate;
@ -155,7 +156,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// we need to import the url // we need to import the url
// getting the url entry // getting the url entry
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.load(urlHash, null); plasmaCrawlLURLEntry urlEntry = this.importUrlDB.load(urlHash, null);
if (urlEntry != null) { if (urlEntry != null) {
/* write it into the home url db */ /* write it into the home url db */

@ -61,14 +61,11 @@ import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Locale; import java.util.Locale;
import java.util.Properties;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.http.httpc.response; import de.anomic.http.httpc.response;
import de.anomic.index.indexEntry; import de.anomic.index.indexEntry;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRAMIndex; import de.anomic.kelondro.kelondroRAMIndex;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
@ -78,7 +75,6 @@ import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.tools.crypt;
import de.anomic.tools.nxTools; import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacySeed;
@ -95,31 +91,15 @@ public final class plasmaCrawlLURL extends indexURL {
private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList gcrawlResultStack; // 6 - local index: triggered external private final LinkedList gcrawlResultStack; // 6 - local index: triggered external
//public static Set damagedURLS = Collections.synchronizedSet(new HashSet());
public plasmaCrawlLURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) { public plasmaCrawlLURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) {
super(); super();
kelondroRow rowdef = new kelondroRow(
"String urlhash-" + urlHashLength + ", " + // the url's hash
"String urlstring-" + urlStringLength + ", " + // the url as string
"String urldescr-" + urlDescrLength + ", " + // the description of the url
"Cardinal moddate-" + urlDateLength + " {b64e}, " + // last-modified from the httpd
"Cardinal loaddate-" + urlDateLength + " {b64e}, " + // time when the url was loaded
"String refhash-" + urlHashLength + ", " + // the url's referrer hash
"Cardinal copycount-" + urlCopyCountLength + " {b64e}, " + //
"byte[] flags-" + urlFlagLength + ", " + // flags
"Cardinal quality-" + urlQualityLength + " {b64e}, " + //
"String language-" + urlLanguageLength + ", " + //
"byte[] doctype-" + urlDoctypeLength + ", " + //
"Cardinal size-" + urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + urlWordCountLength + " {b64e}"); // word count
File cacheFile = new File(cachePath, "urlHash.db"); File cacheFile = new File(cachePath, "urlHash.db");
cacheFile.getParentFile().mkdirs(); cacheFile.getParentFile().mkdirs();
try { try {
urlIndexFile = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef); urlIndexFile = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, plasmaCrawlLURLOldEntry.rowdef);
urlIndexCache = new kelondroRAMIndex(kelondroNaturalOrder.naturalOrder, rowdef); urlIndexCache = new kelondroRAMIndex(kelondroNaturalOrder.naturalOrder, plasmaCrawlLURLOldEntry.rowdef);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
System.exit(-1); System.exit(-1);
@ -134,19 +114,19 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack = new LinkedList(); gcrawlResultStack = new LinkedList();
} }
public synchronized void stack(Entry e, String initiatorHash, String executorHash, int stackType) { public synchronized void stack(plasmaCrawlLURLEntry e, String initiatorHash, String executorHash, int stackType) {
if (e == null) { return; } if (e == null) { return; }
try { try {
if (initiatorHash == null) { initiatorHash = dummyHash; } if (initiatorHash == null) { initiatorHash = dummyHash; }
if (executorHash == null) { executorHash = dummyHash; } if (executorHash == null) { executorHash = dummyHash; }
switch (stackType) { switch (stackType) {
case 0: break; case 0: break;
case 1: externResultStack.add(e.urlHash + initiatorHash + executorHash); break; case 1: externResultStack.add(e.hash() + initiatorHash + executorHash); break;
case 2: searchResultStack.add(e.urlHash + initiatorHash + executorHash); break; case 2: searchResultStack.add(e.hash() + initiatorHash + executorHash); break;
case 3: transfResultStack.add(e.urlHash + initiatorHash + executorHash); break; case 3: transfResultStack.add(e.hash() + initiatorHash + executorHash); break;
case 4: proxyResultStack.add(e.urlHash + initiatorHash + executorHash); break; case 4: proxyResultStack.add(e.hash() + initiatorHash + executorHash); break;
case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; case 5: lcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break;
case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; case 6: gcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break;
} }
return; return;
} catch (Exception ex) { } catch (Exception ex) {
@ -159,7 +139,7 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash); gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
} }
public Entry load(String urlHash, indexEntry searchedWord) { public plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) {
// generates an plasmaLURLEntry using the url hash // generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered // to speed up the access, the url-hashes are buffered
// in the hash cache. // in the hash cache.
@ -171,19 +151,18 @@ public final class plasmaCrawlLURL extends indexURL {
try { try {
if (entry == null) entry = urlIndexFile.get(urlHash.getBytes()); if (entry == null) entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null; if (entry == null) return null;
return new Entry(entry, searchedWord); return new plasmaCrawlLURLOldEntry(entry, searchedWord);
} catch (IOException e) { } catch (IOException e) {
return null; return null;
} }
} }
public void store(Entry entry, boolean cached) throws IOException { public void store(plasmaCrawlLURLEntry entry, boolean cached) throws IOException {
// Check if there is a more recent Entry already in the DB // Check if there is a more recent Entry already in the DB
if (entry.stored) return; plasmaCrawlLURLEntry oldEntry;
Entry oldEntry;
try { try {
if (exists(entry.urlHash)) { if (exists(entry.hash())) {
oldEntry = load(entry.urlHash, null); oldEntry = load(entry.hash(), null);
} else { } else {
oldEntry = null; oldEntry = null;
} }
@ -194,40 +173,32 @@ public final class plasmaCrawlLURL extends indexURL {
// the fetched oldEntry is better, so return its properties instead of the new ones // the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same // this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same // this.url = oldEntry.url; // unnecessary, should be the same
entry.descr = oldEntry.descr; entry = oldEntry;
entry.moddate = oldEntry.moddate;
entry.loaddate = oldEntry.loaddate;
entry.referrerHash = oldEntry.referrerHash;
entry.copyCount = oldEntry.copyCount;
entry.flags = oldEntry.flags;
entry.quality = oldEntry.quality;
entry.language = oldEntry.language;
entry.doctype = oldEntry.doctype;
entry.size = oldEntry.size;
entry.wordCount = oldEntry.wordCount;
// this.snippet // not read from db
// this.word // not read from db
entry.stored = true;
return; // this did not need to be stored, but is updated return; // this did not need to be stored, but is updated
} }
super.store(entry.toRowEntry(), cached); if ((cached) && (urlIndexCache != null)) {
entry.stored = true; synchronized (urlIndexCache) {
urlIndexCache.put(entry.toRowEntry());
}
} else {
urlIndexFile.put(entry.toRowEntry(), entry.loaddate());
}
} }
public synchronized Entry newEntry(String propStr, boolean setGlobal) { public synchronized plasmaCrawlLURLEntry newEntry(String propStr, boolean setGlobal) {
if (propStr.startsWith("{") && propStr.endsWith("}")) { if (propStr.startsWith("{") && propStr.endsWith("}")) {
return new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal); return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal);
} else { } else {
return null; return null;
} }
} }
public synchronized Entry newEntry(URL url, String descr, Date moddate, Date loaddate, public synchronized plasmaCrawlLURLEntry newEntry(URL url, String descr, Date moddate, Date loaddate,
String referrerHash, int copyCount, boolean localNeed, String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype, int quality, String language, char doctype,
int size, int wordCount) { int size, int wordCount) {
Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount); plasmaCrawlLURLEntry e = new plasmaCrawlLURLOldEntry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
return e; return e;
} }
@ -365,7 +336,7 @@ public final class plasmaCrawlLURL extends indexURL {
String urlHash, initiatorHash, executorHash; String urlHash, initiatorHash, executorHash;
String cachepath, urlstr, urltxt; String cachepath, urlstr, urltxt;
yacySeed initiatorSeed, executorSeed; yacySeed initiatorSeed, executorSeed;
plasmaCrawlLURL.Entry urle; plasmaCrawlLURLEntry urle;
URL url; URL url;
// needed for getCachePath(url) // needed for getCachePath(url)
@ -412,317 +383,6 @@ public final class plasmaCrawlLURL extends indexURL {
return prop; return prop;
} }
    /**
     * One record of the loaded-URL (LURL) database: the metadata kept for a
     * page that has already been crawled and indexed. Instances can be built
     * from explicit values, from a database row ({@code kelondroRow.Entry}),
     * or from a property list received via remote search transport, and can
     * be serialized back to either form ({@code toRowEntry()} /
     * {@code toString()}).
     *
     * Note: this is a non-static inner class — it relies on members of the
     * enclosing class such as {@code urlHash(url)}, {@code dummyHash},
     * {@code shortDayFormatter}, {@code urlIndexFile} and the various
     * {@code url*Length} column-width constants.
     */
    public class Entry {

        private URL url;               // the loaded URL itself
        private String descr;          // page description/title; defaults to the URL string
        private Date moddate;          // last-modified date reported by the server
        private Date loaddate;         // date the URL was fetched
        private String urlHash;        // 12-byte base64 hash identifying the URL (primary key)
        private String referrerHash;   // hash of the referring URL, or dummyHash if none
        private int copyCount;         // number of remote (global) copies of this object, excluding this one
        private String flags;          // flag string; first char 'L' = locally needed, 'G' = global
        private int quality;           // page quality rating (base64-encoded in storage)
        private String language;       // two-letter language code; "uk" used as the unknown default
        private char doctype;          // single-char document type code
        private int size;              // size of the file in bytes
        private int wordCount;         // number of words in the document
        private String snippet;        // transient: snippet from remote search; never persisted
        private indexEntry word;       // this is only used if the url is transported via remote search requests
        private boolean stored;        // true once this entry is known to be persisted (skip re-store)

        // more needed attributes:
        // - author / copyright owner
        // - keywords
        // - phrasecount, total number of phrases
        // - boolean: URL attributes (see Word-Entity definition)
        // - boolean: appearance of bold and/or italics
        // - ETag: for re-crawl decision upon HEAD request
        // - int: # of outlinks to same domain
        // - int: # of outlinks to outside domain
        // - int: # of keywords
        // - int: # of links on the page to image, audio, video, applications

        /**
         * Creates a fresh entry from explicit field values (e.g. after a crawl).
         * The URL hash is derived from the URL; null descr/referrerHash/language
         * fall back to the URL string, dummyHash and "uk" respectively.
         */
        public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
            // create new entry and store it into database
            this.urlHash = urlHash(url);
            this.url = url;
            this.descr = (descr == null) ? this.url.toString() : descr;
            this.moddate = moddate;
            this.loaddate = loaddate;
            this.referrerHash = (referrerHash == null) ? dummyHash : referrerHash;
            this.copyCount = copyCount; // the number of remote (global) copies of this object without this one
            this.flags = (localNeed) ? "L " : "  ";
            this.quality = quality;
            this.language = (language == null) ? "uk" : language;
            this.doctype = doctype;
            this.size = size;
            this.wordCount = wordCount;
            this.snippet = null;
            this.word = null;
            this.stored = false;
        }

        /**
         * Reconstructs an entry from a database row (the inverse of
         * {@code toRowEntry()}). Dates are stored as days since epoch, hence
         * the multiplication by 86400000 ms. The optional searchedWord is
         * attached for remote-search result handling.
         *
         * @throws IOException if the row cannot be decoded
         */
        public Entry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
            try {
                this.urlHash = entry.getColString(0, null);
                this.url = new URL(entry.getColString(1, "UTF-8").trim());
                this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim();
                this.moddate = new Date(86400000 * entry.getColLong(3));
                this.loaddate = new Date(86400000 * entry.getColLong(4));
                this.referrerHash = (entry.empty(5)) ? dummyHash : entry.getColString(5, "UTF-8");
                this.copyCount = (int) entry.getColLong(6);
                this.flags = entry.getColString(7, "UTF-8");
                this.quality = (int) entry.getColLong(8);
                this.language = entry.getColString(9, "UTF-8");
                this.doctype = (char) entry.getColByte(10);
                this.size = (int) entry.getColLong(11);
                this.wordCount = (int) entry.getColLong(12);
                this.snippet = null;
                this.word = searchedWord;
                this.stored = false;
                return;
            } catch (Exception e) {
                // wrap any decoding failure (bad URL, malformed column) as IOException
                serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
                throw new IOException("plasmaLURL.entry/1: " + e.toString());
            }
        }

        /**
         * Reconstructs an entry from a property list as produced by
         * {@code toString()} (used for remote search transport). Missing
         * properties fall back to defaults; URL and description are
         * crypt-decoded. If setGlobal is true the flags are forced to "G ".
         * NOTE(review): decoding errors are logged but leave the entry
         * partially initialized rather than throwing.
         */
        public Entry(Properties prop, boolean setGlobal) {
            // generates an plasmaLURLEntry using the properties from the argument
            // the property names must correspond to the one from toString
            //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
            this.urlHash = prop.getProperty("hash", dummyHash);
            try {
                //byte[][] entry = urlHashCache.get(urlHash.getBytes());
                //if (entry == null) {
                this.referrerHash = prop.getProperty("referrer", dummyHash);
                this.moddate = shortDayFormatter.parse(prop.getProperty("mod", "20000101"));
                //System.out.println("DEBUG: moddate = " + moddate + ", prop=" + prop.getProperty("mod"));
                this.loaddate = shortDayFormatter.parse(prop.getProperty("load", "20000101"));
                this.copyCount = Integer.parseInt(prop.getProperty("cc", "0"));
                this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : "  ");
                if (setGlobal) this.flags = "G ";
                this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
                this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null);
                if (this.descr == null) this.descr = this.url.toString();
                this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
                this.language = prop.getProperty("lang", "uk");
                this.doctype = prop.getProperty("dt", "t").charAt(0);
                this.size = Integer.parseInt(prop.getProperty("size", "0"));
                this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
                this.snippet = prop.getProperty("snippet", "");
                if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
                this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
                this.stored = false;
                //}
            } catch (Exception e) {
                serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2:" +
                                    "\nProperties: " + ((prop==null)?null:prop.toString()) +
                                    ((prop.containsKey("word")) ? "\nWord: " + kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word","")) : "") +
                                    "\nErrorMsg: " + e.toString(), e);
            }
        }

        /**
         * Serializes this entry to a database row matching the outer class'
         * column layout. Dates are stored as days since epoch; quality,
         * copyCount, size and wordCount are base64-encoded to their fixed
         * column widths. snippet and word are intentionally not persisted.
         *
         * @throws IOException declared for symmetry with row construction
         */
        public kelondroRow.Entry toRowEntry() throws IOException {
            final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength);
            final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);

            final byte[][] entry = new byte[][] {
                urlHash.getBytes(),
                url.toString().getBytes(),
                descr.getBytes(), // null?
                moddatestr.getBytes(),
                loaddatestr.getBytes(),
                referrerHash.getBytes(),
                kelondroBase64Order.enhancedCoder.encodeLong(copyCount, urlCopyCountLength).getBytes(),
                flags.getBytes(),
                kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength).getBytes(),
                language.getBytes(),
                new byte[] {(byte) doctype},
                kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(),
                kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(),
            };
            return urlIndexFile.row().newEntry(entry);
        }

        public String hash() {
            // return a url-hash, based on the md5 algorithm
            // the result is a String of 12 bytes within a 72-bit space
            // (each byte has an 6-bit range)
            // that should be enough for all web pages on the world
            return this.urlHash;
        }

        public URL url() {
            return url;
        }

        public String descr() {
            return descr;
        }

        public Date moddate() {
            return moddate;
        }

        public Date loaddate() {
            return loaddate;
        }

        public String referrerHash() {
            // return the creator's hash
            return referrerHash;
        }

        public char doctype() {
            return doctype;
        }

        public int copyCount() {
            // return number of copies of this object in the global index
            return copyCount;
        }

        public boolean local() {
            // returns true if the url was created locally and is needed for own word index
            if (flags == null) return false;
            return flags.charAt(0) == 'L';
        }

        public int quality() {
            return quality;
        }

        public String language() {
            return language;
        }

        public int size() {
            return size;
        }

        public int wordCount() {
            return wordCount;
        }

        public String snippet() {
            // the snippet may appear here if the url was transported in a remote search
            // it will not be saved anywhere, but can only be requested here
            return snippet;
        }

        public indexEntry word() {
            return word;
        }

        /**
         * Freshness comparison used to decide whether a stored entry should be
         * replaced: this entry is "older" if its moddate precedes the other's,
         * with loaddate and then quality as tie-breakers.
         */
        public boolean isOlder (Entry other) {
            if (other == null) return false;
            if (moddate.before(other.moddate())) return true;
            if (moddate.equals(other.moddate())) {
                if (loaddate.before(other.loaddate())) return true;
                if (loaddate.equals(other.loaddate())) {
                    if (quality < other.quality()) return true;
                }
            }
            return false;
        }

        /**
         * Builds the common property-list body shared by the toString()
         * variants (the remote-transport wire format). Returns null on any
         * encoding failure (e.g. null dates) rather than throwing.
         */
        private StringBuffer corePropList() {
            // generate a parseable string; this is a simple property-list
            final StringBuffer corePropStr = new StringBuffer(300);
            try {
                corePropStr
                .append("hash=")     .append(urlHash)
                .append(",referrer=").append(referrerHash)
                .append(",mod=")     .append(shortDayFormatter.format(moddate))
                .append(",load=")    .append(shortDayFormatter.format(loaddate))
                .append(",size=")    .append(size)
                .append(",wc=")      .append(wordCount)
                .append(",cc=")      .append(copyCount)
                .append(",local=")   .append(((local()) ? "true" : "false"))
                .append(",q=")       .append(kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength))
                .append(",dt=")      .append(doctype)
                .append(",lang=")    .append(language)
                .append(",url=")     .append(crypt.simpleEncode(url.toString()))
                .append(",descr=")   .append(crypt.simpleEncode(descr));

                if (this.word != null) {
                    // append also word properties
                    corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false)));
                }
                return corePropStr;

            } catch (Exception e) {
                //          serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
                //          if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
                //          if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
                //          e.printStackTrace();
                return null;
            }
        }

        /*
        public String toString(int posintext, int posinphrase, int posofphrase) {
            // add information needed for remote transport
            final StringBuffer core = corePropList();
            if (core == null) return null;

            core.ensureCapacity(core.length() + 200);
            core.insert(0,"{")
            .append(",posintext=").append(posintext)
            .append(",posinphrase=").append(posinphrase)
            .append(",posofphraseint=").append(posofphrase)
            .append("}");
            return core.toString();
        }
        */

        /**
         * Like {@code toString()} but with the given snippet (crypt-encoded)
         * appended — used when shipping a search result to a remote peer.
         */
        public String toString(String snippet) {
            // add information needed for remote transport
            final StringBuffer core = corePropList();
            if (core == null) return null;

            core.ensureCapacity(core.length() + snippet.length()*2);
            core.insert(0,"{");
            core.append(",snippet=").append(crypt.simpleEncode(snippet));
            core.append("}");

            return core.toString();
            //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
        }

        /**
         * Returns this object as String.<br>
         * This e.g. looks like this:
         * <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
         */
        public String toString() {
            final StringBuffer core = corePropList();
            if (core == null) return null;

            core.insert(0,"{");
            core.append("}");

            return core.toString();
            //return "{" + core + "}";
        }

        /** Dumps a human-readable summary of this entry to stdout (debug aid). */
        public void print() {
            System.out.println("URL           : " + url);
            System.out.println("Description   : " + descr);
            System.out.println("Modified      : " + httpc.dateString(moddate));
            System.out.println("Loaded        : " + httpc.dateString(loaddate));
            System.out.println("Size          : " + size + " bytes, " + wordCount + " words");
            System.out.println("Referrer Hash : " + referrerHash);
            System.out.println("Quality       : " + quality);
            System.out.println("Language      : " + language);
            System.out.println("DocType       : " + doctype);
            System.out.println();
        }
    } // class Entry
public class kiter implements Iterator { public class kiter implements Iterator {
// enumerates entry elements // enumerates entry elements
Iterator i; Iterator i;
@ -742,7 +402,7 @@ public final class plasmaCrawlLURL extends indexURL {
kelondroRow.Entry e = (kelondroRow.Entry) i.next(); kelondroRow.Entry e = (kelondroRow.Entry) i.next();
if (e == null) return null; if (e == null) return null;
try { try {
return new Entry(e, null); return new plasmaCrawlLURLOldEntry(e, null);
} catch (IOException ex) { } catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null)); throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
} }
@ -873,7 +533,7 @@ public final class plasmaCrawlLURL extends indexURL {
} }
} }
plasmaCrawlLURL.Entry entry = (plasmaCrawlLURL.Entry) eiter.next(); plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next();
totalSearchedUrls++; totalSearchedUrls++;
if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, entry.url()) || if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, entry.url()) ||
plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, entry.url())) { plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, entry.url())) {
@ -944,7 +604,7 @@ public final class plasmaCrawlLURL extends indexURL {
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false); final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false);
final Iterator enu = urls.entries(true, false, null); final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) { while (enu.hasNext()) {
((Entry) enu.next()).print(); ((plasmaCrawlLURLEntry) enu.next()).print();
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();

@ -385,7 +385,7 @@ public final class plasmaCrawlStacker {
checkInterruption(); checkInterruption();
String nexturlhash = indexURL.urlHash(nexturl); String nexturlhash = indexURL.urlHash(nexturl);
String dbocc = this.sb.urlPool.exists(nexturlhash); String dbocc = this.sb.urlPool.exists(nexturlhash);
plasmaCrawlLURL.Entry oldEntry = null; plasmaCrawlLURLEntry oldEntry = null;
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null); oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder()); boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) { if ((dbocc != null) && (!(recrawl))) {

@ -199,7 +199,7 @@ public class plasmaDHTChunk {
indexContainer container; indexContainer container;
Iterator urlIter; Iterator urlIter;
indexEntry iEntry; indexEntry iEntry;
plasmaCrawlLURL.Entry lurl; plasmaCrawlLURLEntry lurl;
int refcount = 0; int refcount = 0;
int wholesize; int wholesize;
@ -281,11 +281,11 @@ public class plasmaDHTChunk {
} }
public synchronized int deleteTransferIndexes() { public synchronized String deleteTransferIndexes() {
Iterator urlIter; Iterator urlIter;
indexEntry iEntry; indexEntry iEntry;
HashSet urlHashes; HashSet urlHashes;
int count = 0; String count = "0";
for (int i = 0; i < this.indexContainers.length; i++) { for (int i = 0; i < this.indexContainers.length; i++) {
// delete entries separately // delete entries separately
@ -301,7 +301,7 @@ public class plasmaDHTChunk {
urlHashes.add(iEntry.urlHash()); urlHashes.add(iEntry.urlHash());
} }
String wordHash = indexContainers[i].getWordHash(); String wordHash = indexContainers[i].getWordHash();
count += wordIndex.removeEntries(this.indexContainers[i].getWordHash(), urlHashes, true); count = wordIndex.removeEntriesExpl(this.indexContainers[i].getWordHash(), urlHashes, true);
if (log.isFine()) if (log.isFine())
log.logFine("Deleted partial index (" + c + " URLs) for word " + wordHash + "; " + this.wordIndex.indexSize(wordHash) + " entries left"); log.logFine("Deleted partial index (" + c + " URLs) for word " + wordHash + "; " + this.wordIndex.indexSize(wordHash) + " entries left");
this.indexContainers[i] = null; this.indexContainers[i] = null;

@ -222,7 +222,7 @@ public class plasmaDHTFlush extends Thread {
// deleting transfered words from index // deleting transfered words from index
if (this.delete) { if (this.delete) {
this.status = "Running: Deleting chunk " + iteration; this.status = "Running: Deleting chunk " + iteration;
int urlReferences = oldDHTChunk.deleteTransferIndexes(); String urlReferences = oldDHTChunk.deleteTransferIndexes();
this.log.logFine("Deleted from " + oldDHTChunk.containerSize() + " transferred RWIs locally " + urlReferences + " URL references"); this.log.logFine("Deleted from " + oldDHTChunk.containerSize() + " transferred RWIs locally " + urlReferences + " URL references");
} }
oldDHTChunk = null; oldDHTChunk = null;

@ -370,7 +370,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
//if (searchResult.size() == 0) return acc; // case that we have nothing to do //if (searchResult.size() == 0) return acc; // case that we have nothing to do
indexEntry entry; indexEntry entry;
plasmaCrawlLURL.Entry page; plasmaCrawlLURLEntry page;
Long preranking; Long preranking;
Object[] preorderEntry; Object[] preorderEntry;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);

@ -101,7 +101,7 @@ public final class plasmaSearchImages {
public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, plasmaSearchResult sres, int depth) { public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, plasmaSearchResult sres, int depth) {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
this.images = new TreeSet(); this.images = new TreeSet();
plasmaCrawlLURL.Entry urlentry; plasmaCrawlLURLEntry urlentry;
while (sres.hasMoreElements()) { while (sres.hasMoreElements()) {
urlentry = sres.nextElement(); urlentry = sres.nextElement();
addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.url(), depth)); addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.url(), depth));

@ -185,7 +185,13 @@ public final class plasmaSearchPreOrder {
public Object[] /*{indexEntry, Long}*/ next() { public Object[] /*{indexEntry, Long}*/ next() {
String top = (String) pageAcc.firstKey(); String top = (String) pageAcc.firstKey();
//System.out.println("preorder-key: " + top); //System.out.println("preorder-key: " + top);
Long preranking = new Long(Long.MAX_VALUE - Long.parseLong(top.substring(0, 16), 16)); // java.lang.NumberFormatException: For input string: "8000000000020b17" ??? Long preranking;
try {
preranking = new Long(Long.MAX_VALUE - Long.parseLong(top.substring(0, 16), 16)); // java.lang.NumberFormatException: For input string: "8000000000020b17" ???
} catch (NumberFormatException e) {
e.printStackTrace();
preranking = new Long(0);
}
return new Object[]{(indexEntry) pageAcc.remove(top), preranking}; return new Object[]{(indexEntry) pageAcc.remove(top), preranking};
} }

@ -191,7 +191,7 @@ public class plasmaSearchRankingProfile {
Set topwords, Set topwords,
String[] urlcomps, String[] urlcomps,
String[] descrcomps, String[] descrcomps,
plasmaCrawlLURL.Entry page) { plasmaCrawlLURLEntry page) {
// apply pre-calculated order attributes // apply pre-calculated order attributes
long ranking = preranking; long ranking = preranking;

@ -99,13 +99,13 @@ public final class plasmaSearchResult {
return pageAcc.size() > 0; return pageAcc.size() > 0;
} }
public plasmaCrawlLURL.Entry nextElement() { public plasmaCrawlLURLEntry nextElement() {
Object top = pageAcc.firstKey(); Object top = pageAcc.firstKey();
//System.out.println("postorder-key: " + ((String) top)); //System.out.println("postorder-key: " + ((String) top));
return (plasmaCrawlLURL.Entry) pageAcc.remove(top); return (plasmaCrawlLURLEntry) pageAcc.remove(top);
} }
protected void addResult(plasmaCrawlLURL.Entry page, Long preranking) { protected void addResult(plasmaCrawlLURLEntry page, Long preranking) {
// take out relevant information for reference computation // take out relevant information for reference computation
URL url = page.url(); URL url = page.url();
@ -132,12 +132,12 @@ public final class plasmaSearchResult {
for (int i = 0; i < references.length; i++) commonSense.add(references[i]); for (int i = 0; i < references.length; i++) commonSense.add(references[i]);
Object[] resultVector; Object[] resultVector;
plasmaCrawlLURL.Entry page; plasmaCrawlLURLEntry page;
long ranking; long ranking;
for (int i = 0; i < results.size(); i++) { for (int i = 0; i < results.size(); i++) {
// take out values from result array // take out values from result array
resultVector = (Object[]) results.get(i); resultVector = (Object[]) results.get(i);
page = (plasmaCrawlLURL.Entry) resultVector[0]; page = (plasmaCrawlLURLEntry) resultVector[0];
// calculate ranking // calculate ranking
if (postsort) if (postsort)
@ -173,7 +173,7 @@ public final class plasmaSearchResult {
// first scan all entries and find all urls that are referenced // first scan all entries and find all urls that are referenced
while (i.hasNext()) { while (i.hasNext()) {
entry = (Map.Entry) i.next(); entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURL.Entry) entry.getValue()).url()); path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url());
paths.put(path, entry.getKey()); paths.put(path, entry.getKey());
//if (path != null) path = shortenPath(path); //if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey()); //if (path != null) paths.put(path, entry.getKey());
@ -184,7 +184,7 @@ public final class plasmaSearchResult {
String shorten; String shorten;
while (i.hasNext()) { while (i.hasNext()) {
entry = (Map.Entry) i.next(); entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURL.Entry) entry.getValue()).url()); path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url());
shorten = shortenPath(path); shorten = shortenPath(path);
// scan all subpaths of the url // scan all subpaths of the url
while (shorten != null) { while (shorten != null) {

@ -629,7 +629,7 @@ public class plasmaSnippetCache {
public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) { public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) {
// fetch snippets // fetch snippets
int i = 0; int i = 0;
plasmaCrawlLURL.Entry urlentry; plasmaCrawlLURLEntry urlentry;
String urlstring; String urlstring;
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) { while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {

@ -1011,7 +1011,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// possibly delete entries from last chunk // possibly delete entries from last chunk
if ((this.dhtTransferChunk != null) && if ((this.dhtTransferChunk != null) &&
(this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE)) { (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE)) {
int deletedURLs = this.dhtTransferChunk.deleteTransferIndexes(); String deletedURLs = this.dhtTransferChunk.deleteTransferIndexes();
this.log.logFine("Deleted from " + this.dhtTransferChunk.containers().length + " transferred RWIs locally, removed " + deletedURLs + " URL references"); this.log.logFine("Deleted from " + this.dhtTransferChunk.containers().length + " transferred RWIs locally, removed " + deletedURLs + " URL references");
this.dhtTransferChunk = null; this.dhtTransferChunk = null;
} }
@ -1556,7 +1556,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption(); checkInterruption();
// create a new loaded URL db entry // create a new loaded URL db entry
plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.newEntry( plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry(
entry.url(), // URL entry.url(), // URL
docDescription, // document description docDescription, // document description
docDate, // modification date docDate, // modification date
@ -1965,7 +1965,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl"); String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) { if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true); plasmaCrawlLURLEntry entry = urlPool.loadedURL.newEntry(propStr, true);
urlPool.loadedURL.store(entry, false); urlPool.loadedURL.store(entry, false);
urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
urlPool.noticeURL.remove(entry.hash()); urlPool.noticeURL.remove(entry.hash());
@ -2045,7 +2045,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int i = 0; int i = 0;
int p; int p;
URL url; URL url;
plasmaCrawlLURL.Entry urlentry; plasmaCrawlLURLEntry urlentry;
String urlstring, urlname, filename, urlhash; String urlstring, urlname, filename, urlhash;
String host, hash, address, descr = ""; String host, hash, address, descr = "";
yacySeed seed; yacySeed seed;
@ -2192,7 +2192,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry // finally, delete the url entry
// determine the url string // determine the url string
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null); plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0; if (entry == null) return 0;
URL url = entry.url(); URL url = entry.url();

@ -333,7 +333,7 @@ public class plasmaSwitchboardQueue {
public URL referrerURL() { public URL referrerURL() {
if (referrerURL == null) { if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null; if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null); plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null);
if (entry == null) referrerURL = null; else referrerURL = entry.url(); if (entry == null) referrerURL = null; else referrerURL = entry.url();
} }
return referrerURL; return referrerURL;

@ -83,7 +83,7 @@ public class plasmaURLPool {
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url(); if (ne != null) return ne.url();
} catch (IOException e) {} } catch (IOException e) {}
plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null); plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null);
if (le != null) return le.url(); if (le != null) return le.url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url(); if (ee != null) return ee.url();

@ -484,13 +484,25 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
int removed = 0; int removed = 0;
removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete); removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete);
removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete); removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete);
if (removed == urlHashes.size()) return removed; //if (removed == urlHashes.size()) return removed;
if (useCollectionIndex) { if (useCollectionIndex) {
removed += collections.removeEntries(wordHash, urlHashes, deleteComplete); removed += collections.removeEntries(wordHash, urlHashes, deleteComplete);
if (removed == urlHashes.size()) return removed; //if (removed == urlHashes.size()) return removed;
} }
removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete); removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete);
if (removed == urlHashes.size()) return removed; //if (removed == urlHashes.size()) return removed;
removed += backend.removeEntries(wordHash, urlHashes, deleteComplete);
return removed;
}
public String removeEntriesExpl(String wordHash, Set urlHashes, boolean deleteComplete) {
String removed = "";
removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
if (useCollectionIndex) {
removed += collections.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
} else removed += "0, ";
removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
removed += backend.removeEntries(wordHash, urlHashes, deleteComplete); removed += backend.removeEntries(wordHash, urlHashes, deleteComplete);
return removed; return removed;
} }
@ -772,7 +784,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
waiter(); waiter();
entry = (indexEntry) containerIterator.next(); entry = (indexEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash()); // System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash());
plasmaCrawlLURL.Entry ue = lurl.load(entry.urlHash(), null); plasmaCrawlLURLEntry ue = lurl.load(entry.urlHash(), null);
if (ue == null) { if (ue == null) {
urlHashs.add(entry.urlHash()); urlHashs.add(entry.urlHash());
} else { } else {

@ -62,6 +62,7 @@ import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSnippetCache;
@ -496,7 +497,7 @@ public final class yacyClient {
} }
// insert results to containers // insert results to containers
plasmaCrawlLURL.Entry urlEntry; plasmaCrawlLURLEntry urlEntry;
String[] urls = new String[results]; String[] urls = new String[results];
for (int n = 0; n < results; n++) { for (int n = 0; n < results; n++) {
// get one single search result // get one single search result
@ -862,7 +863,7 @@ public final class yacyClient {
-er crawlt, Ergebnis erscheint aber unter falschem initiator -er crawlt, Ergebnis erscheint aber unter falschem initiator
*/ */
public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURL.Entry entry, String wordhashes) { public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURLEntry entry, String wordhashes) {
if (targetSeed == null) { return null; } if (targetSeed == null) { return null; }
if (yacyCore.seedDB.mySeed == null) { return null; } if (yacyCore.seedDB.mySeed == null) { return null; }
if (yacyCore.seedDB.mySeed == targetSeed) { return null; } if (yacyCore.seedDB.mySeed == targetSeed) { return null; }
@ -981,9 +982,9 @@ public final class yacyClient {
if (uhs.length == 0) { return resultObj; } // all url's known if (uhs.length == 0) { return resultObj; } // all url's known
// extract the urlCache from the result // extract the urlCache from the result
plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length]; plasmaCrawlLURLEntry[] urls = new plasmaCrawlLURLEntry[uhs.length];
for (int i = 0; i < uhs.length; i++) { for (int i = 0; i < uhs.length; i++) {
urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]); urls[i] = (plasmaCrawlLURLEntry) urlCache.get(uhs[i]);
if (urls[i] == null) { if (urls[i] == null) {
yacyCore.log.logFine("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'"); yacyCore.log.logFine("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
} }
@ -1092,7 +1093,7 @@ public final class yacyClient {
} }
} }
private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURL.Entry[] urls, boolean gzipBody, int timeout) { private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURLEntry[] urls, boolean gzipBody, int timeout) {
// this post a message to the remote message board // this post a message to the remote message board
final String address = targetSeed.getAddress(); final String address = targetSeed.getAddress();
if (address == null) { return null; } if (address == null) { return null; }

@ -75,11 +75,15 @@ import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaCrawlLURLOldEntry;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURLPool; import de.anomic.plasma.plasmaURLPool;
@ -730,7 +734,7 @@ public final class yacy {
iEntry = (indexEntry) wordIdxEntries.next(); iEntry = (indexEntry) wordIdxEntries.next();
String urlHash = iEntry.urlHash(); String urlHash = iEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.load(urlHash, null); plasmaCrawlLURLEntry urlEntry = currentUrlDB.load(urlHash, null);
urlCounter++; urlCounter++;
minimizedUrlDB.store(urlEntry, false); minimizedUrlDB.store(urlEntry, false);
if (urlCounter % 500 == 0) { if (urlCounter % 500 == 0) {
@ -950,10 +954,10 @@ public final class yacy {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
if (source.equals("lurl")) { if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null); Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURL.Entry entry; plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) { while (eiter.hasNext()) {
try { try {
entry = (plasmaCrawlLURL.Entry) eiter.next(); entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), null); if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), null);
} catch (Exception e) { } catch (Exception e) {
// here a MalformedURLException may occur // here a MalformedURLException may occur
@ -1061,9 +1065,9 @@ public final class yacy {
if (source.equals("lurl")) { if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null); Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURL.Entry entry; plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) { while (eiter.hasNext()) {
entry = (plasmaCrawlLURL.Entry) eiter.next(); entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) { if ((entry != null) && (entry.url() != null)) {
if (html) { if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.descr() + "</a><br>").getBytes("UTF-8")); bos.write(("<a href=\"" + entry.url() + "\">" + entry.descr() + "</a><br>").getBytes("UTF-8"));
@ -1114,6 +1118,27 @@ public final class yacy {
} }
} }
private static void migratelurls(String homePath) {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, false, 1000, false, 1000, false, 10000);
kelondroFlexSplitTable fsp = new kelondroFlexSplitTable(new File(root, "DATA//INDEX/PUBLIC/TEXT"), "urls", 1000, -1, plasmaCrawlLURLOldEntry.rowdef, kelondroNaturalOrder.naturalOrder);
Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
fsp.put(entry.toRowEntry(), entry.loaddate());
}
}
pool.close();
} catch (IOException e) {
e.printStackTrace();
}
}
private static String[] shift(String[] args, int pos, int count) { private static String[] shift(String[] args, int pos, int count) {
String[] newargs = new String[args.length - count]; String[] newargs = new String[args.length - count];
System.arraycopy(args, 0, newargs, 0, pos); System.arraycopy(args, 0, newargs, 0, pos);
@ -1365,6 +1390,8 @@ public final class yacy {
if (args.length == 2) applicationRoot= args[1]; if (args.length == 2) applicationRoot= args[1];
String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt"); String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
urllist(applicationRoot, source, html, outfile); urllist(applicationRoot, source, html, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratelurls"))) {
migratelurls(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) { } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file // generate a url list and save it in a file
if (args.length == 2) applicationRoot= args[1]; if (args.length == 2) applicationRoot= args[1];

Loading…
Cancel
Save