- refactoring of plasmaCrawlLURL.Entry to prepare new Entry format

- added test migration method to migrate the old LURL to a new LURL
the new LURL will be split into different tables, one for each month
this solves several problems:
- the biggest table in YaCy is split into different parts and can
  also be managed in filesystems that are limited to 2GB
- the oldest entries can easily be identified, used for re-crawl, and
  deleted
- The complete database can be limited to a specific size (as wanted many times)


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2755 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 130cc76927
commit a5dd0d41af

@ -56,7 +56,7 @@ import de.anomic.data.listManager;
import de.anomic.data.bookmarksDB.Tag;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -147,7 +147,7 @@ public class Bookmarks {
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaCrawlLURLEntry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaParserDocument document = null;
if(urlentry != null){
document = switchboard.snippetCache.retrieveDocument(urlentry.url(), true);

@ -61,7 +61,7 @@ import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
@ -218,7 +218,7 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashdelete")) {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
@ -265,7 +265,7 @@ public class IndexControl_p {
HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet();
indexEntry iEntry;
plasmaCrawlLURL.Entry lurl;
plasmaCrawlLURLEntry lurl;
while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next();
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
@ -321,7 +321,7 @@ public class IndexControl_p {
URL url = new URL(urlstring);
urlhash = indexURL.urlHash(url);
prop.put("urlhash", urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
@ -335,7 +335,7 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashsearch")) {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
@ -351,12 +351,12 @@ public class IndexControl_p {
try {
final Iterator entryIt = switchboard.urlPool.loadedURL.entries(true, true, urlhash);
StringBuffer result = new StringBuffer("Sequential List of URL-Hashes:<br>");
plasmaCrawlLURL.Entry entry;
plasmaCrawlLURLEntry entry;
int i = 0;
int rows = 0, cols = 0;
prop.put("urlhashsimilar", 1);
while (entryIt.hasNext() && i < 256) {
entry = (plasmaCrawlLURL.Entry) entryIt.next();
entry = (plasmaCrawlLURLEntry) entryIt.next();
prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", entry.hash());
cols++;
if (cols==8) {
@ -403,7 +403,7 @@ public class IndexControl_p {
return prop;
}
public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.Entry entry, String urlhash) {
public static serverObjects genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURLEntry entry, String urlhash) {
serverObjects prop = new serverObjects();
if (entry == null) {
prop.put("genUrlProfile", 1);
@ -412,7 +412,7 @@ public class IndexControl_p {
}
URL url = entry.url();
String referrer = null;
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (le == null) {
referrer = "<unknown>";
} else {
@ -463,7 +463,7 @@ public class IndexControl_p {
while (en.hasNext()) {
xi = (indexEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(uh[0], null);
plasmaCrawlLURLEntry le = switchboard.urlPool.loadedURL.load(uh[0], null);
if (le == null) {
tm.put(uh[0], uh);
} else {

@ -55,13 +55,13 @@ import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.plasmaCrawlLURL.Entry;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -107,7 +107,7 @@ public class ViewFile {
String viewMode = post.get("viewMode","sentences");
// getting the urlEntry that belongs to the url hash
Entry urlEntry = null;
plasmaCrawlLURLEntry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
if (urlEntry == null) {
prop.put("error",2);

@ -64,7 +64,7 @@ import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
@ -358,7 +358,7 @@ public class dir {
try {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.newEntry(
final plasmaCrawlLURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
url, "YaCyShare: " + descr, new Date(), new Date(),
"AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/

@ -51,7 +51,7 @@ import java.util.Date;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -249,7 +249,7 @@ public final class crawlOrder {
// case where we have already the url loaded;
reason = reasonString;
// send lurl-Entry as response
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
if (entry == null) {
response = "rejected";
lurl = "";

@ -51,7 +51,7 @@ import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -124,7 +124,7 @@ public final class crawlReceipt {
prop.put("delay", "3600");
} else if (result.equals("fill")) {
// generating a new loaded URL entry
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true);
plasmaCrawlLURLEntry entry = switchboard.urlPool.loadedURL.newEntry(propStr, true);
if ((entry == null)||(entry.url()==null)) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);

@ -54,7 +54,7 @@ import java.util.Set;
import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
@ -244,10 +244,10 @@ public final class search {
StringBuffer links = new StringBuffer();
String resource = "";
//plasmaIndexEntry pie;
plasmaCrawlLURL.Entry urlentry;
plasmaCrawlLURLEntry urlentry;
plasmaSnippetCache.Snippet snippet;
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = acc.nextElement();
urlentry = (plasmaCrawlLURLEntry) acc.nextElement();
if (includesnippet) {
snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260, 1000);
} else {

@ -48,7 +48,7 @@
import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
@ -90,7 +90,7 @@ public final class transferURL {
final int sizeBefore = sb.urlPool.loadedURL.size();
// read the urls from the other properties and store
String urls;
plasmaCrawlLURL.Entry lEntry;
plasmaCrawlLURLEntry lEntry;
for (int i = 0; i < urlc; i++) {
serverCore.checkInterruption();
urls = (String) post.get("url" + i);

@ -57,7 +57,7 @@ import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchImages;
import de.anomic.plasma.plasmaSearchPreOrder;
@ -189,7 +189,7 @@ public class yacysearch {
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
plasmaCrawlLURL.Entry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
plasmaCrawlLURLEntry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
if (urlentry != null) {
plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true);
if (document != null) {

@ -13,6 +13,7 @@ import java.util.Iterator;
import java.util.Random;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroNaturalOrder;
@ -186,6 +187,10 @@ public class dbtest {
File tablepath = new File(tablename).getParentFile();
table = new kelondroFlexTable(tablepath, new File(tablename).getName(), buffer, preload, testRow, kelondroBase64Order.enhancedCoder);
}
if (dbe.equals("kelondroFlexSplitTable")) {
File tablepath = new File(tablename).getParentFile();
table = new kelondroFlexSplitTable(tablepath, new File(tablename).getName(), buffer, preload, testRow, kelondroBase64Order.enhancedCoder);
}
if (dbe.equals("mysql")) {
table = new dbTable("mysql", testRow);
}
@ -513,6 +518,10 @@ final class dbTable implements kelondroIndex {
}
}
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
try {

@ -47,7 +47,7 @@ import de.anomic.yacy.yacySeedDB;
public class indexURL {
// day formatter for entry export
protected static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");
public static final SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");
// statics for value lengths
public static final int urlHashLength = yacySeedDB.commonHashLength; // 12
@ -428,15 +428,6 @@ public class indexURL {
}
}
public void store(kelondroRow.Entry entry, boolean cached) throws IOException {
if ((cached) && (urlIndexCache != null))
synchronized (urlIndexCache) {
urlIndexCache.put(entry);
}
else
urlIndexFile.put(entry);
}
public void flushCacheSome() {
if (urlIndexCache == null) return;
if (urlIndexCache.size() == 0) return;

@ -446,7 +446,7 @@ public class kelondroCollectionIndex {
indexEntry.setCol(idx_col_lastread, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
indexEntry.setCol(idx_col_lastwrote, kelondroRowCollection.daysSince2000(System.currentTimeMillis()));
index.put(indexEntry);
throw new kelondroException(arrayFile(this.path, this.filenameStub, this.loadfactor, chunksize, partitionnumber, serialnumber).toString(), "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed");
throw new kelondroException(array.filename, "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed");
}
int chunkcountInArray = collection.size();
if (chunkcountInArray != chunkcount) {

@ -209,6 +209,11 @@ public class kelondroColumn {
public String toString() {
StringBuffer s = new StringBuffer();
switch (celltype) {
case celltype_undefined:
s.append(nickname);
s.append('-');
s.append(cellwidth);
break;
case celltype_boolean:
s.append("boolean ");
s.append(nickname);

@ -27,6 +27,7 @@ package de.anomic.kelondro;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondroIndex {
@ -137,6 +138,10 @@ public class kelondroFlexTable extends kelondroFlexWidthArray implements kelondr
return super.get(i);
}
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}
public synchronized kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
int i = index.geti(row.getColBytes(0));
if (i < 0) {

@ -51,6 +51,7 @@
package de.anomic.kelondro;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
public interface kelondroIndex {
@ -60,6 +61,7 @@ public interface kelondroIndex {
public kelondroRow row() throws IOException;
public kelondroRow.Entry get(byte[] key) throws IOException;
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException;
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException;
public kelondroRow.Entry remove(byte[] key) throws IOException;
public Iterator rows(boolean up, boolean rotating, byte[] firstKey) throws IOException;
public void close() throws IOException;

@ -26,6 +26,8 @@
package de.anomic.kelondro;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.TreeMap;
@ -59,6 +61,10 @@ public class kelondroRAMIndex implements kelondroIndex {
return (kelondroRow.Entry) index.get(key);
}
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}
public synchronized Entry put(Entry row) {
return (kelondroRow.Entry) index.put(row.getColBytes(0), row);
}

@ -976,7 +976,7 @@ public class kelondroRecords {
return USAGE.FREEC;
}
private final void dispose(Handle h) throws IOException {
private synchronized final void dispose(Handle h) throws IOException {
// delete element with handle h
// this element is then connected to the deleted-chain and can be
// re-used change counter
@ -1052,7 +1052,7 @@ public class kelondroRecords {
if (markedDeleted.contains(h)) {
// loop detection
this.theLogger.severe("KELONDRO WARNING " + this.filename + ": FREE-Queue contains loops");
return markedDeleted;
return markedDeleted; // TODO: automatic fix
}
markedDeleted.add(h);
seekp = seekpos(h);

@ -25,6 +25,7 @@
package de.anomic.kelondro;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.Random;
import java.util.TreeSet;
@ -76,6 +77,10 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
return entry;
}
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}
public kelondroRow.Entry put(kelondroRow.Entry entry) {
long handle = profile.startWrite();
int index = -1;

@ -47,6 +47,7 @@ package de.anomic.kelondro;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
public class kelondroSplittedTree implements kelondroIndex {
@ -109,6 +110,10 @@ public class kelondroSplittedTree implements kelondroIndex {
return ktfs[partition(key)].get(key);
}
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}
public kelondroRow.Entry put(kelondroRow.Entry row) throws IOException {
return ktfs[partition(row.getColBytes(0))].put(row);
}

@ -50,6 +50,7 @@ import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
@ -404,8 +405,12 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
return (lc.equals(childn.handle()));
}
// Associates the specified value with the specified key in this map
public kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
return put(row);
}
public kelondroRow.Entry put(kelondroRow.Entry newrow) throws IOException {
// Associates the specified value with the specified key in this map
kelondroRow.Entry result = null;
//writeLock.stay(2000, 1000);
if (newrow.columns() != row().columns()) throw new IllegalArgumentException("put: wrong row length " + newrow.columns() + "; must be " + row().columns());

@ -9,6 +9,7 @@ import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDate;
@ -155,7 +156,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// we need to import the url
// getting the url entry
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.load(urlHash, null);
plasmaCrawlLURLEntry urlEntry = this.importUrlDB.load(urlHash, null);
if (urlEntry != null) {
/* write it into the home url db */

@ -61,14 +61,11 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Properties;
import de.anomic.http.httpc;
import de.anomic.http.httpc.response;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRAMIndex;
import de.anomic.kelondro.kelondroRow;
@ -78,7 +75,6 @@ import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.crypt;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
@ -95,31 +91,15 @@ public final class plasmaCrawlLURL extends indexURL {
private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList gcrawlResultStack; // 6 - local index: triggered external
//public static Set damagedURLS = Collections.synchronizedSet(new HashSet());
public plasmaCrawlLURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) {
super();
kelondroRow rowdef = new kelondroRow(
"String urlhash-" + urlHashLength + ", " + // the url's hash
"String urlstring-" + urlStringLength + ", " + // the url as string
"String urldescr-" + urlDescrLength + ", " + // the description of the url
"Cardinal moddate-" + urlDateLength + " {b64e}, " + // last-modified from the httpd
"Cardinal loaddate-" + urlDateLength + " {b64e}, " + // time when the url was loaded
"String refhash-" + urlHashLength + ", " + // the url's referrer hash
"Cardinal copycount-" + urlCopyCountLength + " {b64e}, " + //
"byte[] flags-" + urlFlagLength + ", " + // flags
"Cardinal quality-" + urlQualityLength + " {b64e}, " + //
"String language-" + urlLanguageLength + ", " + //
"byte[] doctype-" + urlDoctypeLength + ", " + //
"Cardinal size-" + urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + urlWordCountLength + " {b64e}"); // word count
File cacheFile = new File(cachePath, "urlHash.db");
cacheFile.getParentFile().mkdirs();
try {
urlIndexFile = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef);
urlIndexCache = new kelondroRAMIndex(kelondroNaturalOrder.naturalOrder, rowdef);
urlIndexFile = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, plasmaCrawlLURLOldEntry.rowdef);
urlIndexCache = new kelondroRAMIndex(kelondroNaturalOrder.naturalOrder, plasmaCrawlLURLOldEntry.rowdef);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@ -134,19 +114,19 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack = new LinkedList();
}
public synchronized void stack(Entry e, String initiatorHash, String executorHash, int stackType) {
public synchronized void stack(plasmaCrawlLURLEntry e, String initiatorHash, String executorHash, int stackType) {
if (e == null) { return; }
try {
if (initiatorHash == null) { initiatorHash = dummyHash; }
if (executorHash == null) { executorHash = dummyHash; }
switch (stackType) {
case 0: break;
case 1: externResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 2: searchResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 3: transfResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 4: proxyResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 1: externResultStack.add(e.hash() + initiatorHash + executorHash); break;
case 2: searchResultStack.add(e.hash() + initiatorHash + executorHash); break;
case 3: transfResultStack.add(e.hash() + initiatorHash + executorHash); break;
case 4: proxyResultStack.add(e.hash() + initiatorHash + executorHash); break;
case 5: lcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break;
case 6: gcrawlResultStack.add(e.hash() + initiatorHash + executorHash); break;
}
return;
} catch (Exception ex) {
@ -159,7 +139,7 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
public Entry load(String urlHash, indexEntry searchedWord) {
public plasmaCrawlLURLEntry load(String urlHash, indexEntry searchedWord) {
// generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@ -171,19 +151,18 @@ public final class plasmaCrawlLURL extends indexURL {
try {
if (entry == null) entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null;
return new Entry(entry, searchedWord);
return new plasmaCrawlLURLOldEntry(entry, searchedWord);
} catch (IOException e) {
return null;
}
}
public void store(Entry entry, boolean cached) throws IOException {
public void store(plasmaCrawlLURLEntry entry, boolean cached) throws IOException {
// Check if there is a more recent Entry already in the DB
if (entry.stored) return;
Entry oldEntry;
plasmaCrawlLURLEntry oldEntry;
try {
if (exists(entry.urlHash)) {
oldEntry = load(entry.urlHash, null);
if (exists(entry.hash())) {
oldEntry = load(entry.hash(), null);
} else {
oldEntry = null;
}
@ -194,40 +173,32 @@ public final class plasmaCrawlLURL extends indexURL {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
entry.descr = oldEntry.descr;
entry.moddate = oldEntry.moddate;
entry.loaddate = oldEntry.loaddate;
entry.referrerHash = oldEntry.referrerHash;
entry.copyCount = oldEntry.copyCount;
entry.flags = oldEntry.flags;
entry.quality = oldEntry.quality;
entry.language = oldEntry.language;
entry.doctype = oldEntry.doctype;
entry.size = oldEntry.size;
entry.wordCount = oldEntry.wordCount;
// this.snippet // not read from db
// this.word // not read from db
entry.stored = true;
entry = oldEntry;
return; // this did not need to be stored, but is updated
}
super.store(entry.toRowEntry(), cached);
entry.stored = true;
if ((cached) && (urlIndexCache != null)) {
synchronized (urlIndexCache) {
urlIndexCache.put(entry.toRowEntry());
}
} else {
urlIndexFile.put(entry.toRowEntry(), entry.loaddate());
}
}
public synchronized Entry newEntry(String propStr, boolean setGlobal) {
public synchronized plasmaCrawlLURLEntry newEntry(String propStr, boolean setGlobal) {
if (propStr.startsWith("{") && propStr.endsWith("}")) {
return new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal);
return new plasmaCrawlLURLOldEntry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal);
} else {
return null;
}
}
public synchronized Entry newEntry(URL url, String descr, Date moddate, Date loaddate,
public synchronized plasmaCrawlLURLEntry newEntry(URL url, String descr, Date moddate, Date loaddate,
String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype,
int size, int wordCount) {
Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
plasmaCrawlLURLEntry e = new plasmaCrawlLURLOldEntry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
return e;
}
@ -365,7 +336,7 @@ public final class plasmaCrawlLURL extends indexURL {
String urlHash, initiatorHash, executorHash;
String cachepath, urlstr, urltxt;
yacySeed initiatorSeed, executorSeed;
plasmaCrawlLURL.Entry urle;
plasmaCrawlLURLEntry urle;
URL url;
// needed for getCachePath(url)
@ -412,317 +383,6 @@ public final class plasmaCrawlLURL extends indexURL {
return prop;
}
public class Entry {
private URL url;
private String descr;
private Date moddate;
private Date loaddate;
private String urlHash;
private String referrerHash;
private int copyCount;
private String flags;
private int quality;
private String language;
private char doctype;
private int size;
private int wordCount;
private String snippet;
private indexEntry word; // this is only used if the url is transported via remote search requests
private boolean stored;
// more needed attributes:
// - author / copyright owner
// - keywords
// - phrasecount, total number of phrases
// - boolean: URL attributes (see Word-Entity definition)
// - boolean: appearance of bold and/or italics
// - ETag: for re-crawl decision upon HEAD request
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - int: # of keywords
// - int: # der auf der Seite vorhandenen Links zu image, audio, video, applications
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
// create new entry and store it into database
this.urlHash = urlHash(url);
this.url = url;
this.descr = (descr == null) ? this.url.toString() : descr;
this.moddate = moddate;
this.loaddate = loaddate;
this.referrerHash = (referrerHash == null) ? dummyHash : referrerHash;
this.copyCount = copyCount; // the number of remote (global) copies of this object without this one
this.flags = (localNeed) ? "L " : " ";
this.quality = quality;
this.language = (language == null) ? "uk" : language;
this.doctype = doctype;
this.size = size;
this.wordCount = wordCount;
this.snippet = null;
this.word = null;
this.stored = false;
}
public Entry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
try {
this.urlHash = entry.getColString(0, null);
this.url = new URL(entry.getColString(1, "UTF-8").trim());
this.descr = (entry.empty(2)) ? this.url.toString() : entry.getColString(2, "UTF-8").trim();
this.moddate = new Date(86400000 * entry.getColLong(3));
this.loaddate = new Date(86400000 * entry.getColLong(4));
this.referrerHash = (entry.empty(5)) ? dummyHash : entry.getColString(5, "UTF-8");
this.copyCount = (int) entry.getColLong(6);
this.flags = entry.getColString(7, "UTF-8");
this.quality = (int) entry.getColLong(8);
this.language = entry.getColString(9, "UTF-8");
this.doctype = (char) entry.getColByte(10);
this.size = (int) entry.getColLong(11);
this.wordCount = (int) entry.getColLong(12);
this.snippet = null;
this.word = searchedWord;
this.stored = false;
return;
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
throw new IOException("plasmaLURL.entry/1: " + e.toString());
}
}
public Entry(Properties prop, boolean setGlobal) {
// generates an plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
this.urlHash = prop.getProperty("hash", dummyHash);
try {
//byte[][] entry = urlHashCache.get(urlHash.getBytes());
//if (entry == null) {
this.referrerHash = prop.getProperty("referrer", dummyHash);
this.moddate = shortDayFormatter.parse(prop.getProperty("mod", "20000101"));
//System.out.println("DEBUG: moddate = " + moddate + ", prop=" + prop.getProperty("mod"));
this.loaddate = shortDayFormatter.parse(prop.getProperty("load", "20000101"));
this.copyCount = Integer.parseInt(prop.getProperty("cc", "0"));
this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " ");
if (setGlobal) this.flags = "G ";
this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null);
if (this.descr == null) this.descr = this.url.toString();
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
this.language = prop.getProperty("lang", "uk");
this.doctype = prop.getProperty("dt", "t").charAt(0);
this.size = Integer.parseInt(prop.getProperty("size", "0"));
this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
this.word = (prop.containsKey("word")) ? new indexURLEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
this.stored = false;
//}
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2:" +
"\nProperties: " + ((prop==null)?null:prop.toString()) +
((prop.containsKey("word")) ? "\nWord: " + kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word","")) : "") +
"\nErrorMsg: " + e.toString(), e);
}
}
/**
 * Serializes this entry into a kelondroRow.Entry for storage in the URL
 * index file. Dates are stored as days since the epoch (millis / 86400000),
 * numeric fields are Base64-encoded with fixed field widths.
 * NOTE(review): all getBytes() calls use the platform default charset —
 * presumably only ASCII-safe content is expected here; confirm before
 * relying on this for non-ASCII descriptions/URLs.
 *
 * @return the row entry representing this URL
 * @throws IOException if the row entry cannot be created
 */
public kelondroRow.Entry toRowEntry() throws IOException {
    // dates are reduced to day granularity for compact storage
    final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength);
    final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
    final byte[][] entry = new byte[][] {
        urlHash.getBytes(),
        url.toString().getBytes(),
        descr.getBytes(), // null? -- NOTE(review): would NPE if descr is null; the Properties ctor falls back to url.toString(), but other ctors are not visible here
        moddatestr.getBytes(),
        loaddatestr.getBytes(),
        referrerHash.getBytes(),
        kelondroBase64Order.enhancedCoder.encodeLong(copyCount, urlCopyCountLength).getBytes(),
        flags.getBytes(),
        kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength).getBytes(),
        language.getBytes(),
        new byte[] {(byte) doctype},
        kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(),
        kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(),
    };
    return urlIndexFile.row().newEntry(entry);
}
/**
 * Returns the url-hash of this entry, based on the md5 algorithm.
 * The result is a String of 12 bytes within a 72-bit space
 * (each byte has a 6-bit range) — enough for all web pages in the world.
 */
public String hash() {
    return this.urlHash;
}
/** Returns the URL of this entry. */
public URL url() {
    return url;
}
/** Returns the document description (title/descr field). */
public String descr() {
    return descr;
}
/** Returns the document modification date. */
public Date moddate() {
    return moddate;
}
/** Returns the date when this URL was loaded. */
public Date loaddate() {
    return loaddate;
}
/** Returns the creator's (referrer's) url-hash. */
public String referrerHash() {
    return referrerHash;
}
/** Returns the single-character document type code. */
public char doctype() {
    return doctype;
}
/** Returns the number of copies of this object in the global index. */
public int copyCount() {
    return copyCount;
}
/**
 * Returns true if the url was created locally; such entries are needed
 * for the own word index. Locality is encoded as 'L' in the first
 * character of the flag string.
 */
public boolean local() {
    return (flags != null) && (flags.charAt(0) == 'L');
}
/** Returns the quality value of this entry. */
public int quality() {
    return quality;
}
/** Returns the two-letter language code of the document. */
public String language() {
    return language;
}
/** Returns the document size in bytes. */
public int size() {
    return size;
}
/** Returns the number of words in the document. */
public int wordCount() {
    return wordCount;
}
/**
 * Returns the snippet, if any. The snippet may appear here if the url was
 * transported in a remote search; it is not saved anywhere, but can only
 * be requested here.
 */
public String snippet() {
    return snippet;
}
/** Returns the associated word index entry, or null if none was transported. */
public indexEntry word() {
    return word;
}
/**
 * Decides whether this entry is older than the given one. The comparison
 * uses the modification date first; ties are broken by the load date, and
 * finally by the quality value (lower quality counts as older).
 *
 * @param other the entry to compare against; null is never "newer"
 * @return true if this entry is strictly older than {@code other}
 */
public boolean isOlder (Entry other) {
    if (other == null) return false;
    final int byModDate = moddate.compareTo(other.moddate());
    if (byModDate != 0) return byModDate < 0;
    final int byLoadDate = loaddate.compareTo(other.loaddate());
    if (byLoadDate != 0) return byLoadDate < 0;
    return quality < other.quality();
}
/**
 * Generates a parseable string for this entry; this is a simple
 * property-list whose keys must match what the Entry(Properties, boolean)
 * constructor reads. URL and description are transport-encoded via
 * crypt.simpleEncode.
 *
 * @return the property list, or null if any field could not be formatted
 *         (errors are deliberately swallowed — legacy behavior)
 */
private StringBuffer corePropList() {
    final StringBuffer corePropStr = new StringBuffer(300);
    try {
        corePropStr
        .append("hash=") .append(urlHash)
        .append(",referrer=").append(referrerHash)
        .append(",mod=") .append(shortDayFormatter.format(moddate))
        .append(",load=") .append(shortDayFormatter.format(loaddate))
        .append(",size=") .append(size)
        .append(",wc=") .append(wordCount)
        .append(",cc=") .append(copyCount)
        .append(",local=") .append(((local()) ? "true" : "false"))
        .append(",q=") .append(kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength))
        .append(",dt=") .append(doctype)
        .append(",lang=") .append(language)
        .append(",url=") .append(crypt.simpleEncode(url.toString()))
        .append(",descr=") .append(crypt.simpleEncode(descr));
        if (this.word != null) {
            // append also word properties
            corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toPropertyForm(false)));
        }
        return corePropStr;
    } catch (Exception e) {
        // a null moddate/loaddate (possible after a failed Properties parse)
        // ends up here; the caller treats null as "no string available"
        // serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
        // if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
        // if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
        // e.printStackTrace();
        return null;
    }
}
/*
public String toString(int posintext, int posinphrase, int posofphrase) {
// add information needed for remote transport
final StringBuffer core = corePropList();
if (core == null) return null;
core.ensureCapacity(core.length() + 200);
core.insert(0,"{")
.append(",posintext=").append(posintext)
.append(",posinphrase=").append(posinphrase)
.append(",posofphraseint=").append(posofphrase)
.append("}");
return core.toString();
}
*/
/**
 * Returns this entry as a property string with the given snippet appended;
 * this adds the information needed for remote transport.
 *
 * @param snippet the snippet to transport with the entry; null is treated
 *                as an empty snippet
 * @return the property string, or null if the core properties could not
 *         be generated
 */
public String toString(String snippet) {
    // FIX: guard against a null snippet — the old code threw a NPE at
    // snippet.length() before any useful work was done
    if (snippet == null) snippet = "";
    final StringBuffer core = corePropList();
    if (core == null) return null;
    // reserve room for the encoded snippet to avoid repeated reallocation
    core.ensureCapacity(core.length() + snippet.length()*2);
    core.insert(0,"{");
    core.append(",snippet=").append(crypt.simpleEncode(snippet));
    core.append("}");
    return core.toString();
}
/**
 * Returns this object as a parseable property String.<br>
 * This e.g. looks like this:
 * <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
 *
 * @return the property string wrapped in braces, or null if the core
 *         properties could not be generated
 */
public String toString() {
    final StringBuffer core = corePropList();
    if (core == null) return null;
    return core.insert(0, "{").append("}").toString();
}
/**
 * Prints a human-readable dump of this entry to stdout (debugging /
 * command-line tooling aid). The output order is fixed and ends with a
 * blank line.
 */
public void print() {
    System.out.println("URL : " + url);
    System.out.println("Description : " + descr);
    System.out.println("Modified : " + httpc.dateString(moddate));
    System.out.println("Loaded : " + httpc.dateString(loaddate));
    System.out.println("Size : " + size + " bytes, " + wordCount + " words");
    System.out.println("Referrer Hash : " + referrerHash);
    System.out.println("Quality : " + quality);
    System.out.println("Language : " + language);
    System.out.println("DocType : " + doctype);
    System.out.println();
}
} // class Entry
public class kiter implements Iterator {
// enumerates entry elements
Iterator i;
@ -742,7 +402,7 @@ public final class plasmaCrawlLURL extends indexURL {
kelondroRow.Entry e = (kelondroRow.Entry) i.next();
if (e == null) return null;
try {
return new Entry(e, null);
return new plasmaCrawlLURLOldEntry(e, null);
} catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
}
@ -873,7 +533,7 @@ public final class plasmaCrawlLURL extends indexURL {
}
}
plasmaCrawlLURL.Entry entry = (plasmaCrawlLURL.Entry) eiter.next();
plasmaCrawlLURLEntry entry = (plasmaCrawlLURLEntry) eiter.next();
totalSearchedUrls++;
if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, entry.url()) ||
plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, entry.url())) {
@ -944,7 +604,7 @@ public final class plasmaCrawlLURL extends indexURL {
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1, 0, false);
final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) {
((Entry) enu.next()).print();
((plasmaCrawlLURLEntry) enu.next()).print();
}
} catch (Exception e) {
e.printStackTrace();

@ -385,7 +385,7 @@ public final class plasmaCrawlStacker {
checkInterruption();
String nexturlhash = indexURL.urlHash(nexturl);
String dbocc = this.sb.urlPool.exists(nexturlhash);
plasmaCrawlLURL.Entry oldEntry = null;
plasmaCrawlLURLEntry oldEntry = null;
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) {

@ -199,7 +199,7 @@ public class plasmaDHTChunk {
indexContainer container;
Iterator urlIter;
indexEntry iEntry;
plasmaCrawlLURL.Entry lurl;
plasmaCrawlLURLEntry lurl;
int refcount = 0;
int wholesize;
@ -281,11 +281,11 @@ public class plasmaDHTChunk {
}
public synchronized int deleteTransferIndexes() {
public synchronized String deleteTransferIndexes() {
Iterator urlIter;
indexEntry iEntry;
HashSet urlHashes;
int count = 0;
String count = "0";
for (int i = 0; i < this.indexContainers.length; i++) {
// delete entries separately
@ -301,7 +301,7 @@ public class plasmaDHTChunk {
urlHashes.add(iEntry.urlHash());
}
String wordHash = indexContainers[i].getWordHash();
count += wordIndex.removeEntries(this.indexContainers[i].getWordHash(), urlHashes, true);
count = wordIndex.removeEntriesExpl(this.indexContainers[i].getWordHash(), urlHashes, true);
if (log.isFine())
log.logFine("Deleted partial index (" + c + " URLs) for word " + wordHash + "; " + this.wordIndex.indexSize(wordHash) + " entries left");
this.indexContainers[i] = null;

@ -222,7 +222,7 @@ public class plasmaDHTFlush extends Thread {
// deleting transfered words from index
if (this.delete) {
this.status = "Running: Deleting chunk " + iteration;
int urlReferences = oldDHTChunk.deleteTransferIndexes();
String urlReferences = oldDHTChunk.deleteTransferIndexes();
this.log.logFine("Deleted from " + oldDHTChunk.containerSize() + " transferred RWIs locally " + urlReferences + " URL references");
}
oldDHTChunk = null;

@ -370,7 +370,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
//if (searchResult.size() == 0) return acc; // case that we have nothing to do
indexEntry entry;
plasmaCrawlLURL.Entry page;
plasmaCrawlLURLEntry page;
Long preranking;
Object[] preorderEntry;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);

@ -101,7 +101,7 @@ public final class plasmaSearchImages {
public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, plasmaSearchResult sres, int depth) {
long start = System.currentTimeMillis();
this.images = new TreeSet();
plasmaCrawlLURL.Entry urlentry;
plasmaCrawlLURLEntry urlentry;
while (sres.hasMoreElements()) {
urlentry = sres.nextElement();
addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.url(), depth));

@ -185,7 +185,13 @@ public final class plasmaSearchPreOrder {
public Object[] /*{indexEntry, Long}*/ next() {
String top = (String) pageAcc.firstKey();
//System.out.println("preorder-key: " + top);
Long preranking = new Long(Long.MAX_VALUE - Long.parseLong(top.substring(0, 16), 16)); // java.lang.NumberFormatException: For input string: "8000000000020b17" ???
Long preranking;
try {
preranking = new Long(Long.MAX_VALUE - Long.parseLong(top.substring(0, 16), 16)); // java.lang.NumberFormatException: For input string: "8000000000020b17" ???
} catch (NumberFormatException e) {
e.printStackTrace();
preranking = new Long(0);
}
return new Object[]{(indexEntry) pageAcc.remove(top), preranking};
}

@ -191,7 +191,7 @@ public class plasmaSearchRankingProfile {
Set topwords,
String[] urlcomps,
String[] descrcomps,
plasmaCrawlLURL.Entry page) {
plasmaCrawlLURLEntry page) {
// apply pre-calculated order attributes
long ranking = preranking;

@ -99,13 +99,13 @@ public final class plasmaSearchResult {
return pageAcc.size() > 0;
}
public plasmaCrawlLURL.Entry nextElement() {
public plasmaCrawlLURLEntry nextElement() {
Object top = pageAcc.firstKey();
//System.out.println("postorder-key: " + ((String) top));
return (plasmaCrawlLURL.Entry) pageAcc.remove(top);
return (plasmaCrawlLURLEntry) pageAcc.remove(top);
}
protected void addResult(plasmaCrawlLURL.Entry page, Long preranking) {
protected void addResult(plasmaCrawlLURLEntry page, Long preranking) {
// take out relevant information for reference computation
URL url = page.url();
@ -132,12 +132,12 @@ public final class plasmaSearchResult {
for (int i = 0; i < references.length; i++) commonSense.add(references[i]);
Object[] resultVector;
plasmaCrawlLURL.Entry page;
plasmaCrawlLURLEntry page;
long ranking;
for (int i = 0; i < results.size(); i++) {
// take out values from result array
resultVector = (Object[]) results.get(i);
page = (plasmaCrawlLURL.Entry) resultVector[0];
page = (plasmaCrawlLURLEntry) resultVector[0];
// calculate ranking
if (postsort)
@ -173,7 +173,7 @@ public final class plasmaSearchResult {
// first scan all entries and find all urls that are referenced
while (i.hasNext()) {
entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURL.Entry) entry.getValue()).url());
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url());
paths.put(path, entry.getKey());
//if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey());
@ -184,7 +184,7 @@ public final class plasmaSearchResult {
String shorten;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
path = urlPath(((plasmaCrawlLURL.Entry) entry.getValue()).url());
path = urlPath(((plasmaCrawlLURLEntry) entry.getValue()).url());
shorten = shortenPath(path);
// scan all subpaths of the url
while (shorten != null) {

@ -629,7 +629,7 @@ public class plasmaSnippetCache {
public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) {
// fetch snippets
int i = 0;
plasmaCrawlLURL.Entry urlentry;
plasmaCrawlLURLEntry urlentry;
String urlstring;
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
while ((acc.hasMoreElements()) && (i < fetchcount) && (System.currentTimeMillis() < limitTime)) {

@ -1011,7 +1011,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// possibly delete entries from last chunk
if ((this.dhtTransferChunk != null) &&
(this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE)) {
int deletedURLs = this.dhtTransferChunk.deleteTransferIndexes();
String deletedURLs = this.dhtTransferChunk.deleteTransferIndexes();
this.log.logFine("Deleted from " + this.dhtTransferChunk.containers().length + " transferred RWIs locally, removed " + deletedURLs + " URL references");
this.dhtTransferChunk = null;
}
@ -1556,7 +1556,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
// create a new loaded URL db entry
plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.newEntry(
plasmaCrawlLURLEntry newEntry = urlPool.loadedURL.newEntry(
entry.url(), // URL
docDescription, // document description
docDate, // modification date
@ -1965,7 +1965,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true);
plasmaCrawlLURLEntry entry = urlPool.loadedURL.newEntry(propStr, true);
urlPool.loadedURL.store(entry, false);
urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
urlPool.noticeURL.remove(entry.hash());
@ -2045,7 +2045,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int i = 0;
int p;
URL url;
plasmaCrawlLURL.Entry urlentry;
plasmaCrawlLURLEntry urlentry;
String urlstring, urlname, filename, urlhash;
String host, hash, address, descr = "";
yacySeed seed;
@ -2192,7 +2192,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry
// determine the url string
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null);
plasmaCrawlLURLEntry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0;
URL url = entry.url();

@ -333,7 +333,7 @@ public class plasmaSwitchboardQueue {
public URL referrerURL() {
if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null);
plasmaCrawlLURLEntry entry = lurls.load(referrerHash, null);
if (entry == null) referrerURL = null; else referrerURL = entry.url();
}
return referrerURL;

@ -83,7 +83,7 @@ public class plasmaURLPool {
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
} catch (IOException e) {}
plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null);
plasmaCrawlLURLEntry le = loadedURL.load(urlhash, null);
if (le != null) return le.url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url();

@ -484,13 +484,25 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
int removed = 0;
removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete);
removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete);
if (removed == urlHashes.size()) return removed;
//if (removed == urlHashes.size()) return removed;
if (useCollectionIndex) {
removed += collections.removeEntries(wordHash, urlHashes, deleteComplete);
if (removed == urlHashes.size()) return removed;
//if (removed == urlHashes.size()) return removed;
}
removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete);
if (removed == urlHashes.size()) return removed;
//if (removed == urlHashes.size()) return removed;
removed += backend.removeEntries(wordHash, urlHashes, deleteComplete);
return removed;
}
/**
 * Removes the given url-hashes for a word from every index layer and
 * reports the per-layer removal counts as a comma-separated list, in the
 * order: dht-in cache, dht-out cache, collection index (or "0" when the
 * collection index is disabled), assortment cluster, backend.
 *
 * @param wordHash       the word whose references are removed
 * @param urlHashes      the url-hashes to remove
 * @param deleteComplete passed through to the underlying removeEntries calls
 * @return the comma-separated removal counts
 */
public String removeEntriesExpl(String wordHash, Set urlHashes, boolean deleteComplete) {
    final StringBuilder removed = new StringBuilder();
    removed.append(dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete)).append(", ");
    removed.append(dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete)).append(", ");
    if (useCollectionIndex) {
        removed.append(collections.removeEntries(wordHash, urlHashes, deleteComplete)).append(", ");
    } else {
        removed.append("0, ");
    }
    removed.append(assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete)).append(", ");
    removed.append(backend.removeEntries(wordHash, urlHashes, deleteComplete));
    return removed.toString();
}
@ -772,7 +784,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
waiter();
entry = (indexEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash());
plasmaCrawlLURL.Entry ue = lurl.load(entry.urlHash(), null);
plasmaCrawlLURLEntry ue = lurl.load(entry.urlHash(), null);
if (ue == null) {
urlHashs.add(entry.urlHash());
} else {

@ -62,6 +62,7 @@ import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSnippetCache;
@ -496,7 +497,7 @@ public final class yacyClient {
}
// insert results to containers
plasmaCrawlLURL.Entry urlEntry;
plasmaCrawlLURLEntry urlEntry;
String[] urls = new String[results];
for (int n = 0; n < results; n++) {
// get one single search result
@ -862,7 +863,7 @@ public final class yacyClient {
-er crawlt, Ergebnis erscheint aber unter falschem initiator
*/
public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURL.Entry entry, String wordhashes) {
public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURLEntry entry, String wordhashes) {
if (targetSeed == null) { return null; }
if (yacyCore.seedDB.mySeed == null) { return null; }
if (yacyCore.seedDB.mySeed == targetSeed) { return null; }
@ -981,9 +982,9 @@ public final class yacyClient {
if (uhs.length == 0) { return resultObj; } // all url's known
// extract the urlCache from the result
plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length];
plasmaCrawlLURLEntry[] urls = new plasmaCrawlLURLEntry[uhs.length];
for (int i = 0; i < uhs.length; i++) {
urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]);
urls[i] = (plasmaCrawlLURLEntry) urlCache.get(uhs[i]);
if (urls[i] == null) {
yacyCore.log.logFine("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
}
@ -1092,7 +1093,7 @@ public final class yacyClient {
}
}
private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURL.Entry[] urls, boolean gzipBody, int timeout) {
private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURLEntry[] urls, boolean gzipBody, int timeout) {
// this post a message to the remote message board
final String address = targetSeed.getAddress();
if (address == null) { return null; }

@ -75,11 +75,15 @@ import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlLURLEntry;
import de.anomic.plasma.plasmaCrawlLURLOldEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURLPool;
@ -730,7 +734,7 @@ public final class yacy {
iEntry = (indexEntry) wordIdxEntries.next();
String urlHash = iEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.load(urlHash, null);
plasmaCrawlLURLEntry urlEntry = currentUrlDB.load(urlHash, null);
urlCounter++;
minimizedUrlDB.store(urlEntry, false);
if (urlCounter % 500 == 0) {
@ -950,10 +954,10 @@ public final class yacy {
long start = System.currentTimeMillis();
if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURL.Entry entry;
plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlLURL.Entry) eiter.next();
entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), null);
} catch (Exception e) {
// here a MalformedURLException may occur
@ -1061,9 +1065,9 @@ public final class yacy {
if (source.equals("lurl")) {
Iterator eiter = pool.loadedURL.entries(true, false, null);
plasmaCrawlLURL.Entry entry;
plasmaCrawlLURLEntry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlLURL.Entry) eiter.next();
entry = (plasmaCrawlLURLEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.descr() + "</a><br>").getBytes("UTF-8"));
@ -1114,6 +1118,27 @@ public final class yacy {
}
}
/**
 * Test migration: copies all entries from the old LURL database into a new
 * kelondroFlexSplitTable, which splits the URL table into separate parts
 * keyed by the entry's load date (enabling month-wise tables, see commit
 * message). Errors are printed to stderr and abort the migration.
 *
 * @param homePath the YaCy application root containing DATA/PLASMADB
 */
private static void migratelurls(String homePath) {
    File root = new File(homePath);
    try {
        plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), 16000, false, 1000, false, 1000, false, 10000);
        // NOTE(review): "DATA//INDEX" contains a doubled slash — presumably
        // harmless on common filesystems, but looks like a typo; confirm.
        kelondroFlexSplitTable fsp = new kelondroFlexSplitTable(new File(root, "DATA//INDEX/PUBLIC/TEXT"), "urls", 1000, -1, plasmaCrawlLURLOldEntry.rowdef, kelondroNaturalOrder.naturalOrder);
        Iterator eiter = pool.loadedURL.entries(true, false, null);
        plasmaCrawlLURLEntry entry;
        while (eiter.hasNext()) {
            entry = (plasmaCrawlLURLEntry) eiter.next();
            // skip null/broken entries; the load date selects the target split table
            if ((entry != null) && (entry.url() != null)) {
                fsp.put(entry.toRowEntry(), entry.loaddate());
            }
        }
        // NOTE(review): only the pool is closed here; fsp is never closed —
        // verify whether kelondroFlexSplitTable requires an explicit close.
        pool.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
private static String[] shift(String[] args, int pos, int count) {
String[] newargs = new String[args.length - count];
System.arraycopy(args, 0, newargs, 0, pos);
@ -1365,6 +1390,8 @@ public final class yacy {
if (args.length == 2) applicationRoot= args[1];
String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
urllist(applicationRoot, source, html, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-migratelurls"))) {
migratelurls(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file
if (args.length == 2) applicationRoot= args[1];

Loading…
Cancel
Save