refactoring: better data encapsulation for indexURL

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2131 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent fd27d55385
commit 757ec28430
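In plain terms, the commit hides the backing kelondroTree (urlHashCache) and the existsIndex map behind indexURL: both fields become protected, and callers such as PerformanceMemory_p.java go through accessor methods (size() plus the newly added cacheNodeChunkSize(), cacheNodeFillStatus() and cacheObjectStatus()) instead of reaching into the field. A minimal, self-contained sketch of that pattern is shown here; it uses a plain HashMap as a stand-in for kelondroTree, and the class and method names below are illustrative only, not part of the commit.

import java.util.HashMap;
import java.util.Map;

// Stand-in for indexURL: the backing store is no longer public;
// callers read it through accessor methods only.
class UrlIndexSketch {
    protected final Map<String, byte[]> urlHashCache = new HashMap<>();

    public int size() {
        return urlHashCache.size();
    }
}

public class EncapsulationDemo {
    public static void main(String[] args) {
        UrlIndexSketch index = new UrlIndexSketch();
        // before this commit: index.urlHashCache.size()  --  after: index.size()
        System.out.println("entries: " + index.size());
    }
}

The PerformanceMemory_p.java hunk below shows the same switch on the caller side for the LURL, NURL and EURL caches.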

@@ -176,22 +176,22 @@ public class PerformanceMemory_p {
ost = sb.cacheManager.dbCacheObjectStatus();
putprop(prop, env, "HTTP", set);
req = sb.urlPool.loadedURL.urlHashCache.size();
chk = sb.urlPool.loadedURL.urlHashCache.cacheNodeChunkSize();
slt = sb.urlPool.loadedURL.urlHashCache.cacheNodeFillStatus();
ost = sb.urlPool.loadedURL.urlHashCache.cacheObjectStatus();
req = sb.urlPool.loadedURL.size();
chk = sb.urlPool.loadedURL.cacheNodeChunkSize();
slt = sb.urlPool.loadedURL.cacheNodeFillStatus();
ost = sb.urlPool.loadedURL.cacheObjectStatus();
putprop(prop, env, "LURL", set);
req = sb.urlPool.noticeURL.urlHashCache.size();
chk = sb.urlPool.noticeURL.urlHashCache.cacheNodeChunkSize();
slt = sb.urlPool.noticeURL.urlHashCache.cacheNodeFillStatus();
ost = sb.urlPool.noticeURL.urlHashCache.cacheObjectStatus();
req = sb.urlPool.noticeURL.size();
chk = sb.urlPool.noticeURL.cacheNodeChunkSize();
slt = sb.urlPool.noticeURL.cacheNodeFillStatus();
ost = sb.urlPool.noticeURL.cacheObjectStatus();
putprop(prop, env, "NURL", set);
req = sb.urlPool.errorURL.urlHashCache.size();
chk = sb.urlPool.errorURL.urlHashCache.cacheNodeChunkSize();
slt = sb.urlPool.errorURL.urlHashCache.cacheNodeFillStatus();
ost = sb.urlPool.errorURL.urlHashCache.cacheObjectStatus();
req = sb.urlPool.errorURL.size();
chk = sb.urlPool.errorURL.cacheNodeChunkSize();
slt = sb.urlPool.errorURL.cacheNodeFillStatus();
ost = sb.urlPool.errorURL.cacheObjectStatus();
putprop(prop, env, "EURL", set);
req = yacyCore.seedDB.sizeConnected() + yacyCore.seedDB.sizeDisconnected() + yacyCore.seedDB.sizePotential();

@@ -382,8 +382,8 @@ public class indexURL {
// the class object
public kelondroTree urlHashCache;
public final HashMap existsIndex; // allow subclasses to access the existsIndex during Entry.store()
protected kelondroTree urlHashCache;
protected final HashMap existsIndex; // allow subclasses to access the existsIndex during Entry.store()
public indexURL() {
urlHashCache = null;
@@ -398,6 +398,18 @@ public class indexURL {
if (urlHashCache != null) urlHashCache.close();
}
public int[] cacheNodeChunkSize() {
return urlHashCache.cacheNodeChunkSize();
}
public int[] cacheNodeFillStatus() {
return urlHashCache.cacheNodeFillStatus();
}
public String[] cacheObjectStatus() {
return urlHashCache.cacheObjectStatus();
}
public boolean exists(String urlHash) {
synchronized (existsIndex) {
Boolean existsInIndex = (Boolean) existsIndex.get(urlHash);

@@ -134,15 +134,15 @@ public class plasmaCrawlEURL extends indexURL {
public class Entry {
private String hash; // the url's hash
private String hash; // the url's hash
private String referrer; // the url's referrer hash
private String initiator; // the crawling initiator
private String executor; // the crawling executor
private URL url; // the url
private String name; // the name of the url, from anchor tag <a>name</a>
private String name; // the name of the url, from anchor tag <a>name</a>
private Date initdate; // the time when the url was first time appeared
private Date trydate; // the time when the url was last time tried to load
private int trycount; // number of tryings
private int trycount; // number of tryings
private String failreason; // string describing reason for load fail
private bitfield flags; // extra space

@@ -59,12 +59,14 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Properties;
import de.anomic.http.httpc;
import de.anomic.http.httpc.response;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroTree;
@@ -132,28 +134,6 @@ public final class plasmaCrawlLURL extends indexURL {
lcrawlResultStack = new LinkedList();
gcrawlResultStack = new LinkedList();
}
/*
public synchronized Entry addEntry(URL url, String descr, Date moddate, Date loaddate,
String initiatorHash, String executorHash,
String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype,
int size, int wordCount, int stackType) {
Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
if (initiatorHash == null) { initiatorHash = dummyHash; }
if (executorHash == null) { executorHash = dummyHash; }
switch (stackType) {
case 0: break;
case 1: externResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 2: searchResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 3: transfResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 4: proxyResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
}
return e;
}
*/
public synchronized void stackEntry(Entry e, String initiatorHash, String executorHash, int stackType) {
if (e == null) { return; }
@@ -794,6 +774,79 @@ public final class plasmaCrawlLURL extends indexURL {
// enumerates entry elements
return new kiter(up, rotating);
}
/**
* Iterates over urlHash.db to detect malformed URL entries.
* Damaged entries are collected in a HashSet and then repaired (via an HTTP HEAD check) or removed.
*/
public void urldbcleanup() {
serverLog log = new serverLog("URLDBCLEANUP");
HashSet damagedURLS = new HashSet();
try {
Iterator eiter = entries(true, false);
int iteratorCount = 0;
while (eiter.hasNext()) try {
eiter.next();
iteratorCount++;
} catch (RuntimeException e) {
String m = e.getMessage();
damagedURLS.add(m.substring(m.length() - 12));
}
try { Thread.sleep(1000); } catch (InterruptedException e) { }
log.logInfo("URLs vorher: " + size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size());
Iterator eiter2 = damagedURLS.iterator();
String urlHash;
while (eiter2.hasNext()) {
urlHash = (String) eiter2.next();
// trying to fix the invalid URL
httpc theHttpc = null;
String oldUrlStr = null;
try {
// getting the url data as byte array
byte[][] entry = urlHashCache.get(urlHash.getBytes());
// getting the wrong url string
oldUrlStr = new String(entry[1]).trim();
int pos = -1;
if ((pos = oldUrlStr.indexOf("://")) != -1) {
// trying to correct the url
String newUrlStr = "http://" + oldUrlStr.substring(pos + 3);
URL newUrl = new URL(newUrlStr);
// doing a http head request to test if the url is correct
theHttpc = httpc.getInstance(newUrl.getHost(), newUrl.getHost(), newUrl.getPort(), 30000, false);
response res = theHttpc.HEAD(newUrl.getPath(), null);
if (res.statusCode == 200) {
entry[1] = newUrl.toString().getBytes();
urlHashCache.put(entry);
log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
} else {
remove(urlHash);
log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + res.status);
}
}
} catch (Exception e) {
remove(urlHash);
log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage());
} finally {
if (theHttpc != null) try {
theHttpc.close();
httpc.returnInstance(theHttpc);
} catch (Exception e) { }
}
}
log.logInfo("URLs nachher: " + size() + " kaputte URLs: " + damagedURLS.size());
} catch (IOException e) {
log.logSevere("IOException", e);
}
}
// The Cleaner class was provided as "UrldbCleaner" by Hydrox
// see http://www.yacy-forum.de/viewtopic.php?p=18093#18093
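With the cleanup loop now living in plasmaCrawlLURL, the caller no longer needs its own copy of it. A rough sketch of the call site after this commit, mirroring the yacy.java hunk that follows (dbroot is the DATA/PLASMADB directory set up there):

// replaces the inlined cleanup loop that is removed from yacy.java below
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(new File(dbroot, "urlHash.db"), 4194304);
currentUrlDB.urldbcleanup();
currentUrlDB.close();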

@@ -71,7 +71,6 @@ import de.anomic.http.httpc;
import de.anomic.http.httpd;
import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler;
import de.anomic.http.httpc.response;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroDyn;
@@ -1287,67 +1286,9 @@ public final class yacy {
File root = new File(homePath);
File dbroot = new File(root, "DATA/PLASMADB");
serverLog log = new serverLog("URLDBCLEANUP");
HashSet damagedURLS = new HashSet();
try {
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(new File(dbroot, "urlHash.db"), 4194304);
Iterator eiter = currentUrlDB.entries(true, false);
int iteratorCount = 0;
while (eiter.hasNext()) try {
eiter.next();
iteratorCount++;
} catch (RuntimeException e) {
String m = e.getMessage();
damagedURLS.add(m.substring(m.length() - 12));
}
try { Thread.sleep(1000); } catch (InterruptedException e) { }
log.logInfo("URLs vorher: " + currentUrlDB.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size());
Iterator eiter2 = damagedURLS.iterator();
String urlHash;
while (eiter2.hasNext()) {
urlHash = (String) eiter2.next();
// trying to fix the invalid URL
httpc theHttpc = null;
String oldUrlStr = null;
try {
// getting the url data as byte array
byte[][] entry = currentUrlDB.urlHashCache.get(urlHash.getBytes());
// getting the wrong url string
oldUrlStr = new String(entry[1]).trim();
int pos = -1;
if ((pos = oldUrlStr.indexOf("://")) != -1) {
// trying to correct the url
String newUrlStr = "http://" + oldUrlStr.substring(pos + 3);
URL newUrl = new URL(newUrlStr);
// doing a http head request to test if the url is correct
theHttpc = httpc.getInstance(newUrl.getHost(), newUrl.getHost(), newUrl.getPort(), 30000, false);
response res = theHttpc.HEAD(newUrl.getPath(), null);
if (res.statusCode == 200) {
entry[1] = newUrl.toString().getBytes();
currentUrlDB.urlHashCache.put(entry);
log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
} else {
currentUrlDB.remove(urlHash);
log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + res.status);
}
}
} catch (Exception e) {
currentUrlDB.remove(urlHash);
log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage());
} finally {
if (theHttpc != null) try {
theHttpc.close();
httpc.returnInstance(theHttpc);
} catch (Exception e) { }
}
}
log.logInfo("URLs nachher: " + currentUrlDB.size() + " kaputte URLs: " + damagedURLS.size());
currentUrlDB.urldbcleanup();
currentUrlDB.close();
} catch (IOException e) {
log.logSevere("IOException", e);
