From a5dd0d41afc97f9abd8fcdc53a711d230bd384aa Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 12 Oct 2006 23:14:41 +0000 Subject: [PATCH] - refactoring of plasmaCrawlLURL.Entry to prepare new Entry format - added test migration method to migrate the old LURL to a new LURL the new LURL will be split into different tables for each month this solves several problems: - the biggest table in YaCy is split into different parts and can also be managed in filesystems that are limited to 2GB - the oldest entries can easily be identified, used for re-crawl and deleted - The complete database can be limited to a specific size (as wanted many times) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2755 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Bookmarks.java | 4 +- htroot/IndexControl_p.java | 20 +- htroot/ViewFile.java | 4 +- htroot/htdocsdefault/dir.java | 4 +- htroot/yacy/crawlOrder.java | 4 +- htroot/yacy/crawlReceipt.java | 4 +- htroot/yacy/search.java | 6 +- htroot/yacy/transferURL.java | 4 +- htroot/yacysearch.java | 4 +- source/dbtest.java | 9 + source/de/anomic/index/indexURL.java | 11 +- .../kelondro/kelondroCollectionIndex.java | 2 +- source/de/anomic/kelondro/kelondroColumn.java | 5 + .../de/anomic/kelondro/kelondroFlexTable.java | 5 + source/de/anomic/kelondro/kelondroIndex.java | 2 + .../de/anomic/kelondro/kelondroRAMIndex.java | 6 + .../de/anomic/kelondro/kelondroRecords.java | 4 +- source/de/anomic/kelondro/kelondroRowSet.java | 5 + .../anomic/kelondro/kelondroSplittedTree.java | 5 + source/de/anomic/kelondro/kelondroTree.java | 7 +- .../plasma/dbImport/plasmaDbImporter.java | 3 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 404 ++---------------- .../de/anomic/plasma/plasmaCrawlStacker.java | 2 +- source/de/anomic/plasma/plasmaDHTChunk.java | 8 +- source/de/anomic/plasma/plasmaDHTFlush.java | 2 +- .../de/anomic/plasma/plasmaSearchEvent.java | 2 +- .../de/anomic/plasma/plasmaSearchImages.java | 2 +- 
.../anomic/plasma/plasmaSearchPreOrder.java | 8 +- .../plasma/plasmaSearchRankingProfile.java | 2 +- .../de/anomic/plasma/plasmaSearchResult.java | 14 +- .../de/anomic/plasma/plasmaSnippetCache.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 10 +- .../anomic/plasma/plasmaSwitchboardQueue.java | 2 +- source/de/anomic/plasma/plasmaURLPool.java | 2 +- source/de/anomic/plasma/plasmaWordIndex.java | 20 +- source/de/anomic/yacy/yacyClient.java | 11 +- source/yacy.java | 37 +- 37 files changed, 193 insertions(+), 453 deletions(-) diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index bdd06ec1b..ddc520540 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -56,7 +56,7 @@ import de.anomic.data.listManager; import de.anomic.data.bookmarksDB.Tag; import de.anomic.http.httpHeader; import de.anomic.net.URL; -import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -147,7 +147,7 @@ public class Bookmarks { bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash); if (bookmark == null) { // try to get the bookmark from the LURL database - plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); + plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); plasmaParserDocument document = null; if(urlentry != null){ document = switchboard.snippetCache.retrieveDocument(urlentry.url(), true); diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 113231e1f..c1c4381aa 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -61,7 +61,7 @@ import de.anomic.index.indexEntry; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; import de.anomic.net.URL; -import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlLURLEntry; import 
de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.urlPattern.plasmaURLPattern; @@ -218,7 +218,7 @@ public class IndexControl_p { } if (post.containsKey("urlhashdelete")) { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); if (entry == null) { prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } else { @@ -265,7 +265,7 @@ public class IndexControl_p { HashMap knownURLs = new HashMap(); HashSet unknownURLEntries = new HashSet(); indexEntry iEntry; - plasmaCrawlLURL.Entry lurl; + plasmaCrawlLURLEntry lurl; while (urlIter.hasNext()) { iEntry = (indexEntry) urlIter.next(); lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null); @@ -321,7 +321,7 @@ public class IndexControl_p { URL url = new URL(urlstring); urlhash = indexURL.urlHash(url); prop.put("urlhash", urlhash); - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); if (entry == null) { prop.put("urlstring", "unknown url: " + urlstring); prop.put("urlhash", ""); @@ -335,7 +335,7 @@ public class IndexControl_p { } if (post.containsKey("urlhashsearch")) { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null); if (entry == null) { prop.put("result", "No Entry for URL hash " + urlhash); } else { @@ -351,12 +351,12 @@ public class IndexControl_p { try { final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash); StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:
"); - plasmaCrawlLURL.Entry entry; + plasmaCrawlLURLEntry entry; int i = 0; int rows = 0, cols = 0; prop.put("urlhashsimilar", 1); while (entryIt.hasNext() && i < 256) { - entry = (plasmaCrawlLURL.Entry) entryIt.next(); + entry = (plasmaCrawlLURLEntry) entryIt.next(); prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", entry.hash()); cols++; if (cols==8) { @@ -403,7 +403,7 @@ public class IndexControl_p { return prop; } - public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.Entry entry, String urlhash) { + public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURLEntry entry, String urlhash) { serverObjects prop = new serverObjects(); if (entry == null) { prop.put("genUrlProfile", 1); @@ -412,7 +412,7 @@ public class IndexControl_p { } URL url = entry.url(); String referrer = null; - plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null); + plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null); if (le == null) { referrer = ""; } else { @@ -463,7 +463,7 @@ public class IndexControl_p { while (en.hasNext()) { xi = (indexEntry) en.next(); uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())}; - plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(uh[0], null); + plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null); if (le == null) { tm.put(uh[0], uh); } else { diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 8681df3b4..ced7a6386 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -55,13 +55,13 @@ import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; import de.anomic.http.httpc; import de.anomic.net.URL; +import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.cache.IResourceInfo; import 
de.anomic.plasma.crawler.plasmaCrawlerException; import de.anomic.plasma.parser.ParserException; -import de.anomic.plasma.plasmaCrawlLURL.Entry; import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -107,7 +107,7 @@ public class ViewFile { String viewMode = post.get("viewMode","sentences"); // getting the urlEntry that belongs to the url hash - Entry urlEntry = null; + plasmaCrawlLURLEntry urlEntry = null; urlEntry = sb.urlPool.loadedURL.load(urlHash, null); if (urlEntry == null) { prop.put("error",2); diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index 0b7a201e1..eaa44ca34 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -64,7 +64,7 @@ import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.net.URL; import de.anomic.plasma.plasmaCondenser; -import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCodings; import de.anomic.server.serverCore; @@ -358,7 +358,7 @@ public class dir { try { final URL url = new URL(urlstring); final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". 
" + descr).getBytes())); - final plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.newEntry( + final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry( url, "YaCyShare: " + descr, new Date(), new Date(), "AAAAAAAAAAAA", /*referrer*/ 0, /*copycount*/ diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java index a5c638469..b60545195 100644 --- a/htroot/yacy/crawlOrder.java +++ b/htroot/yacy/crawlOrder.java @@ -51,7 +51,7 @@ import java.util.Date; import de.anomic.http.httpHeader; import de.anomic.index.indexURL; import de.anomic.net.URL; -import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -249,7 +249,7 @@ public final class crawlOrder { // case where we have already the url loaded; reason = reasonString; // send lurl-Entry as response - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null); + plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null); if (entry == null) { response = "rejected"; lurl = ""; diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index fef31f6ca..aed450308 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -51,7 +51,7 @@ import java.io.IOException; import de.anomic.http.httpHeader; import de.anomic.index.indexURL; import de.anomic.plasma.plasmaCrawlEURL; -import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -124,7 +124,7 @@ public final class crawlReceipt { prop.put("delay", "3600"); } else if (result.equals("fill")) { // generating a new loaded URL entry - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true); + 
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true); if ((entry == null)||(entry.url()==null)) { log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam + "\n\tURL properties: "+ propStr); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 1b94af568..6f687d874 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -54,7 +54,7 @@ import java.util.Set; import de.anomic.http.httpHeader; import de.anomic.index.indexContainer; import de.anomic.index.indexURL; -import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; @@ -244,10 +244,10 @@ public final class search { StringBuffer links = new StringBuffer(); String resource = ""; //plasmaIndexEntry pie; - plasmaCrawlLURL.Entry urlentry; + plasmaCrawlLURLEntry urlentry; plasmaSnippetCache.Snippet snippet; while ((acc.hasMoreElements()) && (i < squery.wantedResults)) { - urlentry = acc.nextElement(); + urlentry = (plasmaCrawlLURLEntry) acc.nextElement(); if (includesnippet) { snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260, 1000); } else { diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 31e4cc1b6..281fd48da 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -48,7 +48,7 @@ import java.io.IOException; import de.anomic.http.httpHeader; -import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCore; @@ -90,7 +90,7 @@ public final class transferURL { final int sizeBefore = sb.urlPool.loadedURL.size(); // read the urls from the other properties and store String urls; - plasmaCrawlLURL.Entry lEntry; + 
plasmaCrawlLURLEntry lEntry; for (int i = 0; i < urlc; i++) { serverCore.checkInterruption(); urls = (String) post.get("url" + i); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 191fa3d71..71e7f8996 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -57,7 +57,7 @@ import de.anomic.http.httpHeader; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.net.URL; -import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSearchImages; import de.anomic.plasma.plasmaSearchPreOrder; @@ -189,7 +189,7 @@ public class yacysearch { return prop; } final String recommendHash = post.get("recommendref", ""); // urlhash - plasmaCrawlLURL.Entry urlentry = sb.urlPool.loadedURL.load(recommendHash, null); + plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null); if (urlentry != null) { plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true); if (document != null) { diff --git a/source/dbtest.java b/source/dbtest.java index e3b7288d6..bc7f5a5ba 100644 --- a/source/dbtest.java +++ b/source/dbtest.java @@ -13,6 +13,7 @@ import java.util.Iterator; import java.util.Random; import de.anomic.kelondro.kelondroBase64Order; +import de.anomic.kelondro.kelondroFlexSplitTable; import de.anomic.kelondro.kelondroFlexTable; import de.anomic.kelondro.kelondroIndex; import de.anomic.kelondro.kelondroNaturalOrder; @@ -186,6 +187,10 @@ public class dbtest { File tablepath = new File(tablename).getParentFile(); table = new kelondroFlexTable(tablepath, new File(tablename).getName(), buffer, preload, testRow, kelondroBase64Order.enhancedCoder); } + if (dbe.equals("kelondroFlexSplitTable")) { + File tablepath = new File(tablename).getParentFile(); + table = new kelondroFlexSplitTable(tablepath, new File(tablename).getName(), buffer, preload, testRow, 
kelondroBase64Order.enhancedCoder); + } if (dbe.equals("mysql")) { table = new dbTable("mysql", testRow); } @@ -513,6 +518,10 @@ final class dbTable implements kelondroIndex { } } + public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException { + return put(row); + } + public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException { try { diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java index 91df5b9fc..12240d564 100644 --- a/source/de/anomic/index/indexURL.java +++ b/source/de/anomic/index/indexURL.java @@ -47,7 +47,7 @@ import de.anomic.yacy.yacySeedDB; public class indexURL { // day formatter for entry export - protected static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd"); + public static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd"); // statics for value lengths public static final int urlHashLength = yacySeedDB.commonHashLength; // 12 @@ -428,15 +428,6 @@ public class indexURL { } } - public void store(kelondroRow.Entry entry, boolean cached) throws IOException { - if ((cached) && (urlIndexCache != null)) - synchronized (urlIndexCache) { - urlIndexCache.put(entry); - } - else - urlIndexFile.put(entry); - } - public void flushCacheSome() { if (urlIndexCache == null) return; if (urlIndexCache.size() == 0) return; diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java index 6dbaa2e2e..68e7da432 100644 --- a/source/de/anomic/kelondro/kelondroCollectionIndex.java +++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java @@ -446,7 +446,7 @@ public class kelondroCollectionIndex { indexEntry.setCol(idx_col_lastread, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); indexEntry.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis())); index.put(indexEntry); - throw new kelondroException(arrayFile(this.path, 
this.filenameStub, this.loadfactor, chunksize, partitionnumber, serialnumber).toString(), "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed"); + throw new kelondroException(array.filename, "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed"); } int chunkcountInArray = collection.size(); if (chunkcountInArray != chunkcount) { diff --git a/source/de/anomic/kelondro/kelondroColumn.java b/source/de/anomic/kelondro/kelondroColumn.java index c4c211cf1..9b56e923c 100644 --- a/source/de/anomic/kelondro/kelondroColumn.java +++ b/source/de/anomic/kelondro/kelondroColumn.java @@ -209,6 +209,11 @@ public class kelondroColumn { public String toString() { StringBuffer s = new StringBuffer(); switch (celltype) { + case celltype_undefined: + s.append(nickname); + s.append('-'); + s.append(cellwidth); + break; case celltype_boolean: s.append("boolean "); s.append(nickname); diff --git a/source/de/anomic/kelondro/kelondroFlexTable.java b/source/de/anomic/kelondro/kelondroFlexTable.java index d171cab8c..e5bc41edb 100644 --- a/source/de/anomic/kelondro/kelondroFlexTable.java +++ b/source/de/anomic/kelondro/kelondroFlexTable.java @@ -27,6 +27,7 @@ package de.anomic.kelondro; import java.io.File; import java.io.IOException; +import java.util.Date; import java.util.Iterator; public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondroIndex { @@ -137,6 +138,10 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr return super.get(i); } + public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException { + return put(row); + } + public synchronized kelondroRow.Entry put(kelondroRow.Entry row) throws IOException { int i = index.geti(row.getColBytes(0)); if (i < 0) { diff --git 
a/source/de/anomic/kelondro/kelondroIndex.java b/source/de/anomic/kelondro/kelondroIndex.java index bc3b6969b..ca426cfb7 100644 --- a/source/de/anomic/kelondro/kelondroIndex.java +++ b/source/de/anomic/kelondro/kelondroIndex.java @@ -51,6 +51,7 @@ package de.anomic.kelondro; import java.io.IOException; +import java.util.Date; import java.util.Iterator; public interface kelondroIndex { @@ -60,6 +61,7 @@ public interface kelondroIndex { public kelondroRow row() throws IOException; public kelondroRow.Entry get(byte[] key) throws IOException; public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException; + public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException; public kelondroRow.Entry remove(byte[] key) throws IOException; public Iterator rows(boolean up, boolean rotating, byte[] firstKey) throws IOException; public void close() throws IOException; diff --git a/source/de/anomic/kelondro/kelondroRAMIndex.java b/source/de/anomic/kelondro/kelondroRAMIndex.java index df0acd6d0..b7792215f 100644 --- a/source/de/anomic/kelondro/kelondroRAMIndex.java +++ b/source/de/anomic/kelondro/kelondroRAMIndex.java @@ -26,6 +26,8 @@ package de.anomic.kelondro; +import java.io.IOException; +import java.util.Date; import java.util.Iterator; import java.util.TreeMap; @@ -59,6 +61,10 @@ public class kelondroRAMIndex implements kelondroIndex { return (kelondroRow.Entry) index.get(key); } + public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException { + return put(row); + } + public synchronized Entry put(Entry row) { return (kelondroRow.Entry) index.put(row.getColBytes(0), row); } diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java index 41ae99c5f..34c184d0d 100644 --- a/source/de/anomic/kelondro/kelondroRecords.java +++ b/source/de/anomic/kelondro/kelondroRecords.java @@ -976,7 +976,7 @@ public class kelondroRecords { return USAGE.FREEC; } - private final void 
dispose(Handle h) throws IOException { + private synchronized final void dispose(Handle h) throws IOException { // delete element with handle h // this element is then connected to the deleted-chain and can be // re-used change counter @@ -1052,7 +1052,7 @@ public class kelondroRecords { if (markedDeleted.contains(h)) { // loop detection this.theLogger.severe("KELONDRO WARNING " + this.filename + ": FREE-Queue contains loops"); - return markedDeleted; + return markedDeleted; // TODO: automatic fix } markedDeleted.add(h); seekp = seekpos(h); diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java index fc00799a1..4eafe2efa 100644 --- a/source/de/anomic/kelondro/kelondroRowSet.java +++ b/source/de/anomic/kelondro/kelondroRowSet.java @@ -25,6 +25,7 @@ package de.anomic.kelondro; import java.io.IOException; +import java.util.Date; import java.util.Iterator; import java.util.Random; import java.util.TreeSet; @@ -76,6 +77,10 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd return entry; } + public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException { + return put(row); + } + public kelondroRow.Entry put(kelondroRow.Entry entry) { long handle = profile.startWrite(); int index = -1; diff --git a/source/de/anomic/kelondro/kelondroSplittedTree.java b/source/de/anomic/kelondro/kelondroSplittedTree.java index 11f7be948..9765d9992 100644 --- a/source/de/anomic/kelondro/kelondroSplittedTree.java +++ b/source/de/anomic/kelondro/kelondroSplittedTree.java @@ -47,6 +47,7 @@ package de.anomic.kelondro; import java.io.File; import java.io.IOException; +import java.util.Date; import java.util.Iterator; public class kelondroSplittedTree implements kelondroIndex { @@ -109,6 +110,10 @@ public class kelondroSplittedTree implements kelondroIndex { return ktfs[partition(key)].get(key); } + public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException { + 
return put(row); + } + public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException { return ktfs[partition(row.getColBytes(0))].put(row); } diff --git a/source/de/anomic/kelondro/kelondroTree.java b/source/de/anomic/kelondro/kelondroTree.java index aad149e1b..a674df9d7 100644 --- a/source/de/anomic/kelondro/kelondroTree.java +++ b/source/de/anomic/kelondro/kelondroTree.java @@ -50,6 +50,7 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.RandomAccessFile; +import java.util.Date; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; @@ -404,8 +405,12 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex { return (lc.equals(childn.handle())); } - // Associates the specified value with the specified key in this map + public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException { + return put(row); + } + public kelondroRow.Entry put(kelondroRow.Entry newrow) throws IOException { + // Associates the specified value with the specified key in this map kelondroRow.Entry result = null; //writeLock.stay(2000, 1000); if (newrow.columns() != row().columns()) throw new IllegalArgumentException("put: wrong row length " + newrow.columns() + "; must be " + row().columns()); diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index b15bcac4c..1f2fe5288 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -9,6 +9,7 @@ import de.anomic.index.indexContainer; import de.anomic.index.indexEntry; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; import de.anomic.server.serverDate; @@ -155,7 +156,7 @@ public class plasmaDbImporter extends 
AbstractImporter implements dbImporter { // we need to import the url // getting the url entry - plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.load(urlHash, null); + plasmaCrawlLURLEntry urlEntry = this.importUrlDB.load(urlHash, null); if (urlEntry != null) { /* write it into the home url db */ diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 55d9665c1..e05edfcb3 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -61,14 +61,11 @@ import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.Locale; -import java.util.Properties; import de.anomic.http.httpc; import de.anomic.http.httpc.response; import de.anomic.index.indexEntry; import de.anomic.index.indexURL; -import de.anomic.index.indexURLEntry; -import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroRAMIndex; import de.anomic.kelondro.kelondroRow; @@ -78,7 +75,6 @@ import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverCodings; import de.anomic.server.serverObjects; import de.anomic.server.logging.serverLog; -import de.anomic.tools.crypt; import de.anomic.tools.nxTools; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; @@ -94,32 +90,16 @@ public final class plasmaCrawlLURL extends indexURL { private final LinkedList proxyResultStack; // 4 - local index: result of proxy fetch/prefetch private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling private final LinkedList gcrawlResultStack; // 6 - local index: triggered external - - //public static Set damagedURLS = Collections.synchronizedSet(new HashSet()); public plasmaCrawlLURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) { super(); - kelondroRow rowdef = new kelondroRow( - "String urlhash-" + urlHashLength + ", " + // the url's hash - 
"String urlstring-" + urlStringLength + ", " + // the url as string - "String urldescr-" + urlDescrLength + ", " + // the description of the url - "Cardinal moddate-" + urlDateLength + " {b64e}, " + // last-modified from the httpd - "Cardinal loaddate-" + urlDateLength + " {b64e}, " + // time when the url was loaded - "String refhash-" + urlHashLength + ", " + // the url's referrer hash - "Cardinal copycount-" + urlCopyCountLength + " {b64e}, " + // - "byte[] flags-" + urlFlagLength + ", " + // flags - "Cardinal quality-" + urlQualityLength + " {b64e}, " + // - "String language-" + urlLanguageLength + ", " + // - "byte[] doctype-" + urlDoctypeLength + ", " + // - "Cardinal size-" + urlSizeLength + " {b64e}, " + // size of file in bytes - "Cardinal wc-" + urlWordCountLength + " {b64e}"); // word count File cacheFile = new File(cachePath, "urlHash.db"); cacheFile.getParentFile().mkdirs(); try { - urlIndexFile = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef); - urlIndexCache = new kelondroRAMIndex(kelondroNaturalOrder.naturalOrder, rowdef); + urlIndexFile = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, plasmaCrawlLURLOldEntry.rowdef); + urlIndexCache = new kelondroRAMIndex(kelondroNaturalOrder.naturalOrder, plasmaCrawlLURLOldEntry.rowdef); } catch (IOException e) { e.printStackTrace(); System.exit(-1); @@ -134,19 +114,19 @@ public final class plasmaCrawlLURL extends indexURL { gcrawlResultStack = new LinkedList(); } - public synchronized void stack(Entry e, String initiatorHash, String executorHash, int stackType) { + public synchronized void stack(plasmaCrawlLURLEntry e, String initiatorHash, String executorHash, int stackType) { if (e == null) { return; } try { if (initiatorHash == null) { initiatorHash = dummyHash; } if (executorHash == null) { executorHash = dummyHash; } switch (stackType) { case 0: break; - case 1: externResultStack.add(e.urlHash + 
initiatorHash + executorHash); break; - case 2: searchResultStack.add(e.urlHash + initiatorHash + executorHash); break; - case 3: transfResultStack.add(e.urlHash + initiatorHash + executorHash); break; - case 4: proxyResultStack.add(e.urlHash + initiatorHash + executorHash); break; - case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; - case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; + case 1: externResultStack.add(e.hash() + initiatorHash + executorHash); break; + case 2: searchResultStack.add(e.hash() + initiatorHash + executorHash); break; + case 3: transfResultStack.add(e.hash() + initiatorHash + executorHash); break; + case 4: proxyResultStack.add(e.hash() + initiatorHash + executorHash); break; + case 5: lcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break; + case 6: gcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break; } return; } catch (Exception ex) { @@ -159,7 +139,7 @@ public final class plasmaCrawlLURL extends indexURL { gcrawlResultStack.add(urlHash + initiatorHash + executorHash); } - public Entry load(String urlHash, indexEntry searchedWord) { + public plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. 
@@ -171,19 +151,18 @@ public final class plasmaCrawlLURL extends indexURL { try { if (entry == null) entry = urlIndexFile.get(urlHash.getBytes()); if (entry == null) return null; - return new Entry(entry, searchedWord); + return new plasmaCrawlLURLOldEntry(entry, searchedWord); } catch (IOException e) { return null; } } - public void store(Entry entry, boolean cached) throws IOException { + public void store(plasmaCrawlLURLEntry entry, boolean cached) throws IOException { // Check if there is a more recent Entry already in the DB - if (entry.stored) return; - Entry oldEntry; + plasmaCrawlLURLEntry oldEntry; try { - if (exists(entry.urlHash)) { - oldEntry = load(entry.urlHash, null); + if (exists(entry.hash())) { + oldEntry = load(entry.hash(), null); } else { oldEntry = null; } @@ -194,40 +173,32 @@ public final class plasmaCrawlLURL extends indexURL { // the fetched oldEntry is better, so return its properties instead of the new ones // this.urlHash = oldEntry.urlHash; // unnecessary, should be the same // this.url = oldEntry.url; // unnecessary, should be the same - entry.descr = oldEntry.descr; - entry.moddate = oldEntry.moddate; - entry.loaddate = oldEntry.loaddate; - entry.referrerHash = oldEntry.referrerHash; - entry.copyCount = oldEntry.copyCount; - entry.flags = oldEntry.flags; - entry.quality = oldEntry.quality; - entry.language = oldEntry.language; - entry.doctype = oldEntry.doctype; - entry.size = oldEntry.size; - entry.wordCount = oldEntry.wordCount; - // this.snippet // not read from db - // this.word // not read from db - entry.stored = true; + entry = oldEntry; return; // this did not need to be stored, but is updated } - super.store(entry.toRowEntry(), cached); - entry.stored = true; + if ((cached) && (urlIndexCache != null)) { + synchronized (urlIndexCache) { + urlIndexCache.put(entry.toRowEntry()); + } + } else { + urlIndexFile.put(entry.toRowEntry(), entry.loaddate()); + } } - - public synchronized Entry newEntry(String propStr, boolean 
setGlobal) { + + public synchronized plasmaCrawlLURLEntry newEntry(String propStr, boolean setGlobal) { if (propStr.startsWith("{") && propStr.endsWith("}")) { - return new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal); + return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal); } else { return null; } } - public synchronized Entry newEntry(URL url, String descr, Date moddate, Date loaddate, + public synchronized plasmaCrawlLURLEntry newEntry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) { - Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount); + plasmaCrawlLURLEntry e = new plasmaCrawlLURLOldEntry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount); return e; } @@ -365,7 +336,7 @@ public final class plasmaCrawlLURL extends indexURL { String urlHash, initiatorHash, executorHash; String cachepath, urlstr, urltxt; yacySeed initiatorSeed, executorSeed; - plasmaCrawlLURL.Entry urle; + plasmaCrawlLURLEntry urle; URL url; // needed for getCachePath(url) @@ -412,317 +383,6 @@ public final class plasmaCrawlLURL extends indexURL { return prop; } - public class Entry { - - private URL url; - - private String descr; - private Date moddate; - private Date loaddate; - private String urlHash; - private String referrerHash; - private int copyCount; - private String flags; - private int quality; - private String language; - private char doctype; - private int size; - private int wordCount; - private String snippet; - private indexEntry word; // this is only used if the url is transported via remote search requests - private boolean stored; - - // more needed attributes: - // - author / copyright owner - // - keywords - // - 
phrasecount, total number of phrases - // - boolean: URL attributes (see Word-Entity definition) - // - boolean: appearance of bold and/or italics - // - ETag: for re-crawl decision upon HEAD request - // - int: # of outlinks to same domain - // - int: # of outlinks to outside domain - // - int: # of keywords - // - int: # der auf der Seite vorhandenen Links zu image, audio, video, applications - - public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) { - // create new entry and store it into database - this.urlHash = urlHash(url); - this.url = url; - this.descr = (descr == null) ? this.url.toString() : descr; - this.moddate = moddate; - this.loaddate = loaddate; - this.referrerHash = (referrerHash == null) ? dummyHash : referrerHash; - this.copyCount = copyCount; // the number of remote (global) copies of this object without this one - this.flags = (localNeed) ? "L " : " "; - this.quality = quality; - this.language = (language == null) ? "uk" : language; - this.doctype = doctype; - this.size = size; - this.wordCount = wordCount; - this.snippet = null; - this.word = null; - this.stored = false; - } - - public Entry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException { - try { - this.urlHash = entry.getColString(0, null); - this.url = new URL(entry.getColString(1, "UTF-8").trim()); - this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim(); - this.moddate = new Date(86400000 * entry.getColLong(3)); - this.loaddate = new Date(86400000 * entry.getColLong(4)); - this.referrerHash = (entry.empty(5)) ? 
dummyHash : entry.getColString(5, "UTF-8"); - this.copyCount = (int) entry.getColLong(6); - this.flags = entry.getColString(7, "UTF-8"); - this.quality = (int) entry.getColLong(8); - this.language = entry.getColString(9, "UTF-8"); - this.doctype = (char) entry.getColByte(10); - this.size = (int) entry.getColLong(11); - this.wordCount = (int) entry.getColLong(12); - this.snippet = null; - this.word = searchedWord; - this.stored = false; - return; - } catch (Exception e) { - serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e); - throw new IOException("plasmaLURL.entry/1: " + e.toString()); - } - } - - public Entry(Properties prop, boolean setGlobal) { - // generates an plasmaLURLEntry using the properties from the argument - // the property names must correspond to the one from toString - //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); - this.urlHash = prop.getProperty("hash", dummyHash); - try { - //byte[][] entry = urlHashCache.get(urlHash.getBytes()); - //if (entry == null) { - this.referrerHash = prop.getProperty("referrer", dummyHash); - this.moddate = shortDayFormatter.parse(prop.getProperty("mod", "20000101")); - //System.out.println("DEBUG: moddate = " + moddate + ", prop=" + prop.getProperty("mod")); - this.loaddate = shortDayFormatter.parse(prop.getProperty("load", "20000101")); - this.copyCount = Integer.parseInt(prop.getProperty("cc", "0")); - this.flags = ((prop.getProperty("local", "true").equals("true")) ? 
"L " : " "); - if (setGlobal) this.flags = "G "; - this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null)); - this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); - if (this.descr == null) this.descr = this.url.toString(); - this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", "")); - this.language = prop.getProperty("lang", "uk"); - this.doctype = prop.getProperty("dt", "t").charAt(0); - this.size = Integer.parseInt(prop.getProperty("size", "0")); - this.wordCount = Integer.parseInt(prop.getProperty("wc", "0")); - this.snippet = prop.getProperty("snippet", ""); - if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); - this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null; - this.stored = false; - //} - } catch (Exception e) { - serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2:" + - "\nProperties: " + ((prop==null)?null:prop.toString()) + - ((prop.containsKey("word")) ? "\nWord: " + kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word","")) : "") + - "\nErrorMsg: " + e.toString(), e); - } - } - - public kelondroRow.Entry toRowEntry() throws IOException { - final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength); - final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength); - - final byte[][] entry = new byte[][] { - urlHash.getBytes(), - url.toString().getBytes(), - descr.getBytes(), // null? 
- moddatestr.getBytes(), - loaddatestr.getBytes(), - referrerHash.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(copyCount, urlCopyCountLength).getBytes(), - flags.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength).getBytes(), - language.getBytes(), - new byte[] {(byte) doctype}, - kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(), - }; - return urlIndexFile.row().newEntry(entry); - } - - public String hash() { - // return a url-hash, based on the md5 algorithm - // the result is a String of 12 bytes within a 72-bit space - // (each byte has an 6-bit range) - // that should be enough for all web pages on the world - return this.urlHash; - } - - public URL url() { - return url; - } - - public String descr() { - return descr; - } - - public Date moddate() { - return moddate; - } - - public Date loaddate() { - return loaddate; - } - - public String referrerHash() { - // return the creator's hash - return referrerHash; - } - - public char doctype() { - return doctype; - } - - public int copyCount() { - // return number of copies of this object in the global index - return copyCount; - } - - public boolean local() { - // returns true if the url was created locally and is needed for own word index - if (flags == null) return false; - return flags.charAt(0) == 'L'; - } - - public int quality() { - return quality; - } - - public String language() { - return language; - } - - public int size() { - return size; - } - - public int wordCount() { - return wordCount; - } - - public String snippet() { - // the snippet may appear here if the url was transported in a remote search - // it will not be saved anywhere, but can only be requested here - return snippet; - } - - public indexEntry word() { - return word; - } - - public boolean isOlder (Entry other) { - if (other == null) return false; - if 
(moddate.before(other.moddate())) return true; - if (moddate.equals(other.moddate())) { - if (loaddate.before(other.loaddate())) return true; - if (loaddate.equals(other.loaddate())) { - if (quality < other.quality()) return true; - } - } - return false; - } - - private StringBuffer corePropList() { - // generate a parseable string; this is a simple property-list - final StringBuffer corePropStr = new StringBuffer(300); - try { - corePropStr - .append("hash=") .append(urlHash) - .append(",referrer=").append(referrerHash) - .append(",mod=") .append(shortDayFormatter.format(moddate)) - .append(",load=") .append(shortDayFormatter.format(loaddate)) - .append(",size=") .append(size) - .append(",wc=") .append(wordCount) - .append(",cc=") .append(copyCount) - .append(",local=") .append(((local()) ? "true" : "false")) - .append(",q=") .append(kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength)) - .append(",dt=") .append(doctype) - .append(",lang=") .append(language) - .append(",url=") .append(crypt.simpleEncode(url.toString())) - .append(",descr=") .append(crypt.simpleEncode(descr)); - - if (this.word != null) { - // append also word properties - corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false))); - } - return corePropStr; - - } catch (Exception e) { -// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage()); -// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null"); -// if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null"); -// e.printStackTrace(); - return null; - } - } - - /* - public String toString(int posintext, int posinphrase, int posofphrase) { - // add information needed for remote transport - final StringBuffer core = corePropList(); - if (core == null) return null; - - core.ensureCapacity(core.length() + 200); - core.insert(0,"{") - .append(",posintext=").append(posintext) - 
.append(",posinphrase=").append(posinphrase) - .append(",posofphraseint=").append(posofphrase) - .append("}"); - return core.toString(); - } - */ - - public String toString(String snippet) { - // add information needed for remote transport - final StringBuffer core = corePropList(); - if (core == null) return null; - - core.ensureCapacity(core.length() + snippet.length()*2); - core.insert(0,"{"); - core.append(",snippet=").append(crypt.simpleEncode(snippet)); - core.append("}"); - - return core.toString(); - //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}"; - } - - /** - * Returns this object as String.
- * This e.g. looks like this: - *
{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}
- */ - public String toString() { - final StringBuffer core = corePropList(); - if (core == null) return null; - - core.insert(0,"{"); - core.append("}"); - - return core.toString(); - //return "{" + core + "}"; - } - - public void print() { - System.out.println("URL : " + url); - System.out.println("Description : " + descr); - System.out.println("Modified : " + httpc.dateString(moddate)); - System.out.println("Loaded : " + httpc.dateString(loaddate)); - System.out.println("Size : " + size + " bytes, " + wordCount + " words"); - System.out.println("Referrer Hash : " + referrerHash); - System.out.println("Quality : " + quality); - System.out.println("Language : " + language); - System.out.println("DocType : " + doctype); - System.out.println(); - } - } // class Entry - public class kiter implements Iterator { // enumerates entry elements Iterator i; @@ -742,7 +402,7 @@ public final class plasmaCrawlLURL extends indexURL { kelondroRow.Entry e = (kelondroRow.Entry) i.next(); if (e == null) return null; try { - return new Entry(e, null); + return new plasmaCrawlLURLOldEntry(e, null); } catch (IOException ex) { throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null)); } @@ -873,7 +533,7 @@ public final class plasmaCrawlLURL extends indexURL { } } - plasmaCrawlLURL.Entry entry = (plasmaCrawlLURL.Entry) eiter.next(); + plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next(); totalSearchedUrls++; if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, entry.url()) || plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, entry.url())) { @@ -944,7 +604,7 @@ public final class plasmaCrawlLURL extends indexURL { final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false); final Iterator enu = urls.entries(true, false, null); while (enu.hasNext()) { - ((Entry) enu.next()).print(); + ((plasmaCrawlLURLEntry) enu.next()).print(); } } catch (Exception e) { 
e.printStackTrace(); diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 52171a235..5fd51eec6 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -385,7 +385,7 @@ public final class plasmaCrawlStacker { checkInterruption(); String nexturlhash = indexURL.urlHash(nexturl); String dbocc = this.sb.urlPool.exists(nexturlhash); - plasmaCrawlLURL.Entry oldEntry = null; + plasmaCrawlLURLEntry oldEntry = null; oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null); boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder()); if ((dbocc != null) && (!(recrawl))) { diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index 27f515033..c827ee6af 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -199,7 +199,7 @@ public class plasmaDHTChunk { indexContainer container; Iterator urlIter; indexEntry iEntry; - plasmaCrawlLURL.Entry lurl; + plasmaCrawlLURLEntry lurl; int refcount = 0; int wholesize; @@ -281,11 +281,11 @@ public class plasmaDHTChunk { } - public synchronized int deleteTransferIndexes() { + public synchronized String deleteTransferIndexes() { Iterator urlIter; indexEntry iEntry; HashSet urlHashes; - int count = 0; + String count = "0"; for (int i = 0; i < this.indexContainers.length; i++) { // delete entries separately @@ -301,7 +301,7 @@ public class plasmaDHTChunk { urlHashes.add(iEntry.urlHash()); } String wordHash = indexContainers[i].getWordHash(); - count += wordIndex.removeEntries(this.indexContainers[i].getWordHash(), urlHashes, true); + count = wordIndex.removeEntriesExpl(this.indexContainers[i].getWordHash(), urlHashes, true); if (log.isFine()) log.logFine("Deleted partial index (" + c + " URLs) for word " + wordHash + "; " + 
this.wordIndex.indexSize(wordHash) + " entries left"); this.indexContainers[i] = null; diff --git a/source/de/anomic/plasma/plasmaDHTFlush.java b/source/de/anomic/plasma/plasmaDHTFlush.java index 9284bd254..10d95f563 100644 --- a/source/de/anomic/plasma/plasmaDHTFlush.java +++ b/source/de/anomic/plasma/plasmaDHTFlush.java @@ -222,7 +222,7 @@ public class plasmaDHTFlush extends Thread { // deleting transfered words from index if (this.delete) { this.status = "Running: Deleting chunk " + iteration; - int urlReferences = oldDHTChunk.deleteTransferIndexes(); + String urlReferences = oldDHTChunk.deleteTransferIndexes(); this.log.logFine("Deleted from " + oldDHTChunk.containerSize() + " transferred RWIs locally " + urlReferences + " URL references"); } oldDHTChunk = null; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 71da580a2..466596b2c 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -370,7 +370,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { //if (searchResult.size() == 0) return acc; // case that we have nothing to do indexEntry entry; - plasmaCrawlLURL.Entry page; + plasmaCrawlLURLEntry page; Long preranking; Object[] preorderEntry; int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java index d6ea1bd9d..d849b394d 100644 --- a/source/de/anomic/plasma/plasmaSearchImages.java +++ b/source/de/anomic/plasma/plasmaSearchImages.java @@ -101,7 +101,7 @@ public final class plasmaSearchImages { public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, plasmaSearchResult sres, int depth) { long start = System.currentTimeMillis(); this.images = new TreeSet(); - plasmaCrawlLURL.Entry urlentry; + plasmaCrawlLURLEntry urlentry; while (sres.hasMoreElements()) { urlentry 
= sres.nextElement(); addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.url(), depth)); diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 18f353dd0..4985859f4 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -185,7 +185,13 @@ public final class plasmaSearchPreOrder { public Object[] /*{indexEntry, Long}*/ next() { String top = (String) pageAcc.firstKey(); //System.out.println("preorder-key: " + top); - Long preranking = new Long(Long.MAX_VALUE - Long.parseLong(top.substring(0, 16), 16)); // java.lang.NumberFormatException: For input string: "8000000000020b17" ??? + Long preranking; + try { + preranking = new Long(Long.MAX_VALUE - Long.parseLong(top.substring(0, 16), 16)); // java.lang.NumberFormatException: For input string: "8000000000020b17" ??? + } catch (NumberFormatException e) { + e.printStackTrace(); + preranking = new Long(0); + } return new Object[]{(indexEntry) pageAcc.remove(top), preranking}; } diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 0f04ab5ec..355f60839 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -191,7 +191,7 @@ public class plasmaSearchRankingProfile { Set topwords, String[] urlcomps, String[] descrcomps, - plasmaCrawlLURL.Entry page) { + plasmaCrawlLURLEntry page) { // apply pre-calculated order attributes long ranking = preranking; diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index 9bd849ece..0a2234ce3 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -99,13 +99,13 @@ public final class plasmaSearchResult { return pageAcc.size() > 0; } - public 
plasmaCrawlLURL.Entry nextElement() { + public plasmaCrawlLURLEntry nextElement() { Object top = pageAcc.firstKey(); //System.out.println("postorder-key: " + ((String) top)); - return (plasmaCrawlLURL.Entry) pageAcc.remove(top); + return (plasmaCrawlLURLEntry) pageAcc.remove(top); } - protected void addResult(plasmaCrawlLURL.Entry page, Long preranking) { + protected void addResult(plasmaCrawlLURLEntry page, Long preranking) { // take out relevant information for reference computation URL url = page.url(); @@ -132,12 +132,12 @@ public final class plasmaSearchResult { for (int i = 0; i < references.length; i++) commonSense.add(references[i]); Object[] resultVector; - plasmaCrawlLURL.Entry page; + plasmaCrawlLURLEntry page; long ranking; for (int i = 0; i < results.size(); i++) { // take out values from result array resultVector = (Object[]) results.get(i); - page = (plasmaCrawlLURL.Entry) resultVector[0]; + page = (plasmaCrawlLURLEntry) resultVector[0]; // calculate ranking if (postsort) @@ -173,7 +173,7 @@ public final class plasmaSearchResult { // first scan all entries and find all urls that are referenced while (i.hasNext()) { entry = (Map.Entry) i.next(); - path = urlPath(((plasmaCrawlLURL.Entry) entry.getValue()).url()); + path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url()); paths.put(path, entry.getKey()); //if (path != null) path = shortenPath(path); //if (path != null) paths.put(path, entry.getKey()); @@ -184,7 +184,7 @@ public final class plasmaSearchResult { String shorten; while (i.hasNext()) { entry = (Map.Entry) i.next(); - path = urlPath(((plasmaCrawlLURL.Entry) entry.getValue()).url()); + path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url()); shorten = shortenPath(path); // scan all subpaths of the url while (shorten != null) { diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 6ee1f2de8..e6e6516aa 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ 
b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -629,7 +629,7 @@ public class plasmaSnippetCache { public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) { // fetch snippets int i = 0; - plasmaCrawlLURL.Entry urlentry; + plasmaCrawlLURLEntry urlentry; String urlstring; long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index bc9ed397c..4f1e77eab 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1011,7 +1011,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // possibly delete entries from last chunk if ((this.dhtTransferChunk != null) && (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE)) { - int deletedURLs = this.dhtTransferChunk.deleteTransferIndexes(); + String deletedURLs = this.dhtTransferChunk.deleteTransferIndexes(); this.log.logFine("Deleted from " + this.dhtTransferChunk.containers().length + " transferred RWIs locally, removed " + deletedURLs + " URL references"); this.dhtTransferChunk = null; } @@ -1556,7 +1556,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); // create a new loaded URL db entry - plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.newEntry( + plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry( entry.url(), // URL docDescription, // document description docDate, // modification date @@ -1965,7 +1965,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String lurl = (String) page.get("lurl"); if ((lurl != null) && (lurl.length() != 0)) { String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); - 
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true); + plasmaCrawlLURLEntry entry = urlPool.loadedURL.newEntry(propStr, true); urlPool.loadedURL.store(entry, false); urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? urlPool.noticeURL.remove(entry.hash()); @@ -2045,7 +2045,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser int i = 0; int p; URL url; - plasmaCrawlLURL.Entry urlentry; + plasmaCrawlLURLEntry urlentry; String urlstring, urlname, filename, urlhash; String host, hash, address, descr = ""; yacySeed seed; @@ -2192,7 +2192,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // finally, delete the url entry // determine the url string - plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null); + plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null); if (entry == null) return 0; URL url = entry.url(); diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index 51d66e748..cfe7b1391 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -333,7 +333,7 @@ public class plasmaSwitchboardQueue { public URL referrerURL() { if (referrerURL == null) { if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null; - plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null); + plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null); if (entry == null) referrerURL = null; else referrerURL = entry.url(); } return referrerURL; diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java index eaa6f1cca..d1d4e0940 100644 --- a/source/de/anomic/plasma/plasmaURLPool.java +++ b/source/de/anomic/plasma/plasmaURLPool.java @@ -83,7 +83,7 @@ public class plasmaURLPool { plasmaCrawlNURL.Entry ne = 
noticeURL.getEntry(urlhash); if (ne != null) return ne.url(); } catch (IOException e) {} - plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null); + plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null); if (le != null) return le.url(); plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); if (ee != null) return ee.url(); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index df03367d6..bcf518db2 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -484,13 +484,25 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { int removed = 0; removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete); removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete); - if (removed == urlHashes.size()) return removed; + //if (removed == urlHashes.size()) return removed; if (useCollectionIndex) { removed += collections.removeEntries(wordHash, urlHashes, deleteComplete); - if (removed == urlHashes.size()) return removed; + //if (removed == urlHashes.size()) return removed; } removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete); - if (removed == urlHashes.size()) return removed; + //if (removed == urlHashes.size()) return removed; + removed += backend.removeEntries(wordHash, urlHashes, deleteComplete); + return removed; + } + + public String removeEntriesExpl(String wordHash, Set urlHashes, boolean deleteComplete) { + String removed = ""; + removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; + removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; + if (useCollectionIndex) { + removed += collections.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; + } else removed += "0, "; + removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; removed += backend.removeEntries(wordHash, urlHashes, 
deleteComplete); return removed; } @@ -772,7 +784,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { waiter(); entry = (indexEntry) containerIterator.next(); // System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash()); - plasmaCrawlLURL.Entry ue = lurl.load(entry.urlHash(), null); + plasmaCrawlLURLEntry ue = lurl.load(entry.urlHash(), null); if (ue == null) { urlHashs.add(entry.urlHash()); } else { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 6da6f90a7..61a5009cb 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -62,6 +62,7 @@ import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlLURLEntry; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSnippetCache; @@ -496,7 +497,7 @@ public final class yacyClient { } // insert results to containers - plasmaCrawlLURL.Entry urlEntry; + plasmaCrawlLURLEntry urlEntry; String[] urls = new String[results]; for (int n = 0; n < results; n++) { // get one single search result @@ -862,7 +863,7 @@ public final class yacyClient { -er crawlt, Ergebnis erscheint aber unter falschem initiator */ - public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURL.Entry entry, String wordhashes) { + public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURLEntry entry, String wordhashes) { if (targetSeed == null) { return null; } if (yacyCore.seedDB.mySeed == null) { return null; } if (yacyCore.seedDB.mySeed == targetSeed) { return null; } @@ -981,9 +982,9 @@ public final class yacyClient { if (uhs.length == 0) { return resultObj; } // all url's known // extract the 
urlCache from the result - plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length]; + plasmaCrawlLURLEntry[] urls = new plasmaCrawlLURLEntry[uhs.length]; for (int i = 0; i < uhs.length; i++) { - urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]); + urls[i] = (plasmaCrawlLURLEntry) urlCache.get(uhs[i]); if (urls[i] == null) { yacyCore.log.logFine("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'"); } @@ -1092,7 +1093,7 @@ public final class yacyClient { } } - private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURL.Entry[] urls, boolean gzipBody, int timeout) { + private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURLEntry[] urls, boolean gzipBody, int timeout) { // this post a message to the remote message board final String address = targetSeed.getAddress(); if (address == null) { return null; } diff --git a/source/yacy.java b/source/yacy.java index 8a4b8e25e..f937f3002 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -75,11 +75,15 @@ import de.anomic.index.indexEntry; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; import de.anomic.kelondro.kelondroDyn; +import de.anomic.kelondro.kelondroFlexSplitTable; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMap; +import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlLURLEntry; +import de.anomic.plasma.plasmaCrawlLURLOldEntry; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURLPool; @@ -730,7 +734,7 @@ public final class yacy { iEntry = (indexEntry) wordIdxEntries.next(); String urlHash = iEntry.urlHash(); if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { - plasmaCrawlLURL.Entry urlEntry = currentUrlDB.load(urlHash, null); + 
plasmaCrawlLURLEntry urlEntry = currentUrlDB.load(urlHash, null); urlCounter++; minimizedUrlDB.store(urlEntry, false); if (urlCounter % 500 == 0) { @@ -950,10 +954,10 @@ public final class yacy { long start = System.currentTimeMillis(); if (source.equals("lurl")) { Iterator eiter = pool.loadedURL.entries(true, false, null); - plasmaCrawlLURL.Entry entry; + plasmaCrawlLURLEntry entry; while (eiter.hasNext()) { try { - entry = (plasmaCrawlLURL.Entry) eiter.next(); + entry = (plasmaCrawlLURLEntry) eiter.next(); if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), null); } catch (Exception e) { // here a MalformedURLException may occur @@ -1061,9 +1065,9 @@ public final class yacy { if (source.equals("lurl")) { Iterator eiter = pool.loadedURL.entries(true, false, null); - plasmaCrawlLURL.Entry entry; + plasmaCrawlLURLEntry entry; while (eiter.hasNext()) { - entry = (plasmaCrawlLURL.Entry) eiter.next(); + entry = (plasmaCrawlLURLEntry) eiter.next(); if ((entry != null) && (entry.url() != null)) { if (html) { bos.write(("" + entry.descr() + "
").getBytes("UTF-8")); @@ -1114,6 +1118,27 @@ public final class yacy { } } + private static void migratelurls(String homePath) { + File root = new File(homePath); + try { + plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, false, 1000, false, 1000, false, 10000); + kelondroFlexSplitTable fsp = new kelondroFlexSplitTable(new File(root, "DATA//INDEX/PUBLIC/TEXT"), "urls", 1000, -1, plasmaCrawlLURLOldEntry.rowdef, kelondroNaturalOrder.naturalOrder); + + Iterator eiter = pool.loadedURL.entries(true, false, null); + plasmaCrawlLURLEntry entry; + while (eiter.hasNext()) { + entry = (plasmaCrawlLURLEntry) eiter.next(); + if ((entry != null) && (entry.url() != null)) { + fsp.put(entry.toRowEntry(), entry.loaddate()); + } + } + + pool.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + private static String[] shift(String[] args, int pos, int count) { String[] newargs = new String[args.length - count]; System.arraycopy(args, 0, newargs, 0, pos); @@ -1365,6 +1390,8 @@ public final class yacy { if (args.length == 2) applicationRoot= args[1]; String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt"); urllist(applicationRoot, source, html, outfile); + } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratelurls"))) { + migratelurls(applicationRoot); } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) { // generate a url list and save it in a file if (args.length == 2) applicationRoot= args[1];