fixed some bugs concerning URL entry retrieval and the IndexControl interface

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1212 6c8d7289-2bf4-0310-a012-ef5d649a1542
Branch: pull/1/head
Author: orbiter
Parent: 83a34b838d
Commit: 4500506735
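
The common thread in the hunks below: plasmaCrawlLURL.Entry lookups now throw an IOException when a url-hash cannot be resolved (see the new check in plasmaCrawlLURL), so the patched callers wrap getEntry() in try/catch and treat the exception as "entry not found". The following is only a minimal, self-contained sketch of that error-handling style; the class and method names (UrlStoreSketch, resolveOrEmpty) are hypothetical and not part of the YaCy code base.

    import java.io.IOException;
    import java.util.HashMap;
    import java.util.Map;

    // Sketch of the pattern adopted in this commit: a lookup that throws
    // IOException for an unknown hash, and a caller that catches it and
    // degrades gracefully. Names are illustrative, not YaCy API.
    public class UrlStoreSketch {

        private final Map<String, String> store = new HashMap<String, String>();

        public void put(String urlHash, String url) {
            store.put(urlHash, url);
        }

        // analogous to getEntry(): a missing hash raises IOException instead of returning null
        public String getEntry(String urlHash) throws IOException {
            String url = store.get(urlHash);
            if (url == null) throw new IOException("url hash " + urlHash + " not found");
            return url;
        }

        // analogous to the patched callers: catch the exception and continue
        public String resolveOrEmpty(String urlHash) {
            try {
                return getEntry(urlHash);
            } catch (IOException e) {
                return ""; // entry not found, keep going
            }
        }

        public static void main(String[] args) {
            UrlStoreSketch s = new UrlStoreSketch();
            s.put("abc", "http://example.org");
            System.out.println(s.resolveOrEmpty("abc")); // prints the url
            System.out.println(s.resolveOrEmpty("xyz")); // prints an empty line
        }
    }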

@@ -97,7 +97,7 @@ public class CacheAdmin_p {
         // generate sorted dir/file listing
         final String[] list = dir.list();
-        final StringBuffer tree = new StringBuffer((list.length + 2) * 256);
+        final StringBuffer tree = new StringBuffer((list == null) ? 70 : (list.length + 2) * 256);
         tree.append("Directory of<br>").append((pathString.length() == 0) ? "domain list" : linkPathString(pathString)).append("<br><br>");
         if (list == null) {
             tree.append("[empty]");

@@ -47,6 +47,7 @@
 // if the shell's current path is HTROOT
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.Enumeration;
 import java.util.HashSet;
@@ -91,10 +92,10 @@ public class IndexControl_p {
         }
         // default values
-        String keystring = ((String) post.get("keystring")).trim();
-        String keyhash = ((String) post.get("keyhash")).trim();
-        String urlstring = ((String) post.get("urlstring")).trim();
-        String urlhash = ((String) post.get("urlhash")).trim();
+        String keystring = ((String) post.get("keystring", "")).trim();
+        String keyhash = ((String) post.get("keyhash", "")).trim();
+        String urlstring = ((String) post.get("urlstring", "")).trim();
+        String urlhash = ((String) post.get("urlhash", "")).trim();
         if (!urlstring.startsWith("http://") &&
             !urlstring.startsWith("https://")) { urlstring = "http://" + urlstring; }
@@ -166,9 +167,7 @@ public class IndexControl_p {
             }
         }
         if (delurlref) {
-            for (int i = 0; i < urlx.length; i++) try {
-                switchboard.removeAllUrlReferences(urlx[i], true);
-            } catch (IOException e) {}
+            for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
         }
         if (delurl || delurlref) {
             for (int i = 0; i < urlx.length; i++) {
@@ -188,9 +187,7 @@ public class IndexControl_p {
         // delete selected URLs
         if (post.containsKey("keyhashdelete")) {
             if (delurlref) {
-                for (int i = 0; i < urlx.length; i++) try {
-                    switchboard.removeAllUrlReferences(urlx[i], true);
-                } catch (IOException e) {}
+                for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
             }
             if (delurl || delurlref) {
                 for (int i = 0; i < urlx.length; i++) {
@@ -211,12 +208,12 @@ public class IndexControl_p {
         }
         if (post.containsKey("urlhashdeleteall")) {
-            try {
+            //try {
                 int i = switchboard.removeAllUrlReferences(urlhash, true);
                 prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
-            } catch (IOException e) {
-                prop.put("result", "Deleted nothing because the url-hash could not be resolved");
-            }
+            //} catch (IOException e) {
+            //    prop.put("result", "Deleted nothing because the url-hash could not be resolved");
+            //}
         }
         if (post.containsKey("urlhashdelete")) {
@@ -311,11 +308,7 @@ public class IndexControl_p {
             while (hashIt.hasNext() && i < 256) {
                 hash = (String) hashIt.next();
                 result.append("<a href=\"/IndexControl_p.html?")
-                      .append("keystring=")
-                      .append("&keyhash=").append(hash)
-                      .append("&urlhash=")
-                      .append("&urlstring=")
-                      .append("&urlhashsearch=")
+                      .append("keyhash=").append(hash).append("&keyhashsearch=")
                       .append("\" class=\"tt\">").append(hash).append("</a> ")
                       .append(((i + 1) % 8 == 0) ? "<br>" : "");
                 i++;
@@ -326,12 +319,15 @@ public class IndexControl_p {
         if (post.containsKey("urlstringsearch")) {
             try {
                 URL url = new URL(urlstring);
                 urlhash = plasmaURL.urlHash(url);
                 prop.put("urlhash", urlhash);
                 plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
                 prop.put("result", genUrlProfile(switchboard, entry, urlhash));
-            } catch (Exception e) {
-                prop.put("urlstring", "wrong url: " + urlstring);
+            } catch (MalformedURLException e) {
+                prop.put("urlstring", "bad url: " + urlstring);
+                prop.put("urlhash", "");
+            } catch (IOException e) {
+                prop.put("urlstring", "unknown url: " + urlstring);
                 prop.put("urlhash", "");
             }
         }
@@ -356,7 +352,10 @@ public class IndexControl_p {
             int i = 0;
             while (hashIt.hasNext() && i < 256) {
                 hash = (String) hashIt.next();
-                result.append("<a href=\"/IndexControl_p.html?").append("keystring=").append("&keyhash=").append("&urlhash=").append(hash).append("&urlstring=").append("&urlhashsearch=").append("\" class=\"tt\">").append(hash).append("</a> ").append(((i + 1) % 8 == 0) ? "<br>" : "");
+                result.append("<a href=\"/IndexControl_p.html?")
+                      .append("urlhash=").append(hash).append("&urlhashsearch=")
+                      .append("\" class=\"tt\">").append(hash).append("</a> ")
+                      .append(((i + 1) % 8 == 0) ? "<br>" : "");
                 i++;
             }
             prop.put("result", result.toString());
@@ -449,10 +448,10 @@ public class IndexControl_p {
         final TreeMap tm = new TreeMap();
         while (en.hasNext()) {
             uh = ((plasmaWordIndexEntry)en.next()).getUrlHash();
-            if (switchboard.urlPool.loadedURL.exists(uh)) {
+            try {
                 us = switchboard.urlPool.loadedURL.getEntry(uh).url().toString();
                 tm.put(us, uh);
-            } else {
+            } catch (IOException e) {
                 tm.put("", uh);
             }
         }

@@ -468,6 +468,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
             // if the url cannot be found, this returns null
             this.urlHash = urlHash;
             byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
+            if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
             try {
                 if (entry != null) {
                     this.url = new URL(new String(entry[1]).trim());

@@ -1,6 +1,7 @@
 package de.anomic.plasma;
 import java.io.File;
+import java.io.IOException;
 import java.util.Iterator;
 import java.util.Vector;
@@ -247,17 +248,16 @@ public class plasmaDbImporter extends Thread {
                     entryCounter++;
                     importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
                     String urlHash = importWordIdxEntry.getUrlHash();
-                    if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) {
-                        urlCounter++;
+                    if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) try {
                         // importing the new url
                         plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash);
+                        urlCounter++;
                         this.homeUrlDB.newEntry(urlEntry);
                         if (urlCounter % 500 == 0) {
                             this.log.logFine(urlCounter + " URLs processed so far.");
                         }
-                    }
+                    } catch (IOException e) {}
                     // adding word index entity to container
                     newContainer.add(importWordIdxEntry,System.currentTimeMillis());

@@ -217,27 +217,31 @@ public final class plasmaSearchEvent {
         profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_PRESORT, rcLocal.size());
         profileLocal.startTimer();
         plasmaSearchResult acc = new plasmaSearchResult(query);
         if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty
         if (searchResult.size() == 0) return acc; // case that we have nothing to do
         // start url-fetch
         plasmaWordIndexEntry entry;
         long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + postorderTime;
         plasmaCrawlLURL.Entry page;
         int minEntries = profileLocal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT);
         try {
             while (preorder.hasNext()) {
                 if ((acc.sizeFetched() >= minEntries) && (System.currentTimeMillis() >= postorderLimitTime)) break;
                 entry = preorder.next();
                 // find the url entry
-                page = urlStore.getEntry(entry.getUrlHash());
-                // add a result
-                acc.addResult(entry, page);
+                try {
+                    page = urlStore.getEntry(entry.getUrlHash());
+                    // add a result
+                    acc.addResult(entry, page);
+                } catch (IOException e) {
+                    // result was not found
+                }
             }
         } catch (kelondroException ee) {
             serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
         }
         profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_URLFETCH);
         profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_URLFETCH, acc.sizeFetched());

@@ -1221,12 +1221,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             // create index
             String descr = document.getMainLongTitle();
             String referrerHash;
-            try {
-                URL referrerURL = entry.referrerURL();
-                referrerHash = plasmaURL.urlHash(referrerURL);
-            } catch (IOException e) {
-                referrerHash = plasmaURL.dummyHash;
-            }
+            URL referrerURL = entry.referrerURL();
+            referrerHash = plasmaURL.urlHash(referrerURL);
+            if (referrerHash == null) referrerHash = plasmaURL.dummyHash;
             String noIndexReason = "unspecified";
             if (processCase == 4) {
                 // proxy-load
@@ -1825,26 +1823,32 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     }

     // method for index deletion
-    public int removeAllUrlReferences(URL url, boolean fetchOnline) throws IOException {
+    public int removeAllUrlReferences(URL url, boolean fetchOnline) {
         return removeAllUrlReferences(plasmaURL.urlHash(url), fetchOnline);
     }

-    public int removeAllUrlReferences(String urlhash, boolean fetchOnline) throws IOException {
+    public int removeAllUrlReferences(String urlhash, boolean fetchOnline) {
         // find all the words in a specific resource and remove the url reference from every word index
         // finally, delete the url entry
         // determine the url string
-        plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash);
-        URL url = entry.url();
-        if (url == null) return 0;
-        // get set of words
-        //Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
-        Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
-        // delete all word references
-        int count = removeReferences(urlhash, words);
-        // finally delete the url entry itself
-        urlPool.loadedURL.remove(urlhash);
-        return count;
+        try {
+            plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash);
+            URL url = entry.url();
+            if (url == null) return 0;
+            // get set of words
+            //Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
+            Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
+            // delete all word references
+            int count = removeReferences(urlhash, words);
+            // finally delete the url entry itself
+            urlPool.loadedURL.remove(urlhash);
+            return count;
+        } catch (IOException e) {
+            return 0;
+        }
     }

     public int removeReferences(URL url, Set words) {

@@ -286,10 +286,15 @@ public class plasmaSwitchboardQueue {
             return responseHeader;
         }

-        public URL referrerURL() throws IOException {
+        public URL referrerURL() {
             if (referrerURL == null) {
                 if ((referrerHash == null) || (referrerHash.equals(plasmaURL.dummyHash))) return null;
-                referrerURL = lurls.getEntry(referrerHash).url();
+                try {
+                    referrerURL = lurls.getEntry(referrerHash).url();
+                } catch (IOException e) {
+                    referrerURL = null;
+                    return null;
+                }
             }
             return referrerURL;
         }

@@ -473,6 +473,7 @@ public class plasmaURL {
     }

     public static final String urlHash(String url) {
+        if ((url == null) || (url.length() == 0)) return null;
         try {
             return urlHash(new URL(url));
         } catch (MalformedURLException e) {
@@ -542,7 +543,7 @@ public class plasmaURL {
     }

     public Iterator urlHashes(String urlHash, boolean up) {
-        return urlHashCache.rows(up, false, urlHash.getBytes());
+        return urlHashCache.keys(up, false, urlHash.getBytes());
     }
 }

@@ -75,8 +75,10 @@ public class plasmaURLPool {
         if (urlhash.equals(plasmaURL.dummyHash)) return null;
         plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
         if (ne != null) return ne.url();
-        plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash);
-        if (le != null) return le.url();
+        try {
+            plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash);
+            if (le != null) return le.url();
+        } catch (IOException e) {}
         plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
         if (ee != null) return ee.url();
         return null;

@@ -330,12 +330,16 @@ public final class plasmaWordIndexDistribution {
                 urlIter = indexEntity.elements(true);
                 unknownURLEntries.clear();
                 while (urlIter.hasNext()) {
                     indexEntry = (plasmaWordIndexEntry) urlIter.next();
-                    lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
-                    if ((lurl == null) || (lurl.url() == null)) {
-                        unknownURLEntries.add(indexEntry.getUrlHash());
-                    } else {
-                        knownURLs.put(indexEntry.getUrlHash(), lurl);
+                    try {
+                        lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
+                        if ((lurl == null) || (lurl.url() == null)) {
+                            unknownURLEntries.add(indexEntry.getUrlHash());
+                        } else {
+                            knownURLs.put(indexEntry.getUrlHash(), lurl);
+                        }
+                    } catch (IOException e) {
+                        unknownURLEntries.add(indexEntry.getUrlHash());
                     }
                 }
                 // now delete all entries that have no url entry
@@ -367,13 +371,17 @@ public final class plasmaWordIndexDistribution {
                 unknownURLEntries.clear();
                 while ((urlIter.hasNext()) && (count > 0)) {
                     indexEntry = (plasmaWordIndexEntry) urlIter.next();
-                    lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
-                    if ((lurl == null) || (lurl.url()==null)) {
-                        unknownURLEntries.add(indexEntry.getUrlHash());
-                    } else {
-                        knownURLs.put(indexEntry.getUrlHash(), lurl);
-                        tmpEntity.addEntry(indexEntry);
-                        count--;
+                    try {
+                        lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
+                        if ((lurl == null) || (lurl.url()==null)) {
+                            unknownURLEntries.add(indexEntry.getUrlHash());
+                        } else {
+                            knownURLs.put(indexEntry.getUrlHash(), lurl);
+                            tmpEntity.addEntry(indexEntry);
+                            count--;
+                        }
+                    } catch (IOException e) {
+                        unknownURLEntries.add(indexEntry.getUrlHash());
                     }
                 }
                 // now delete all entries that have no url entry

@@ -166,6 +166,7 @@ public final class serverDate {
         if (utime < utimeyearsacc[years]) years--; // the correction
         long remain = utime - utimeyearsacc[years];
         months = (int) (remain / (29 * dayMillis)); // a guess
+        if (months > 11) months = 11;
         if ((years & 3) == 0) {
             if (remain < dimleapacc[months]) months--; // correction
             remain = remain - dimleapacc[months];

@@ -797,17 +797,16 @@ public final class yacy {
                     entryCounter++;
                     importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
                     String urlHash = importWordIdxEntry.getUrlHash();
-                    if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) {
-                        urlCounter++;
+                    if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try {
                         // importing the new url
                         plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash);
+                        urlCounter++;
                         homeUrlDB.newEntry(urlEntry);
                         if (urlCounter % 500 == 0) {
                             log.logFine(urlCounter + " URLs processed so far.");
                         }
-                    }
+                    } catch (IOException e) {}
                     // adding word index entity to container
                     newContainer.add(importWordIdxEntry,System.currentTimeMillis());
@@ -906,14 +905,14 @@ public final class yacy {
                 while (wordIdxEntries.hasNext()) {
                     wordIdxEntry = (plasmaWordIndexEntry) wordIdxEntries.next();
                     String urlHash = wordIdxEntry.getUrlHash();
-                    if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) {
-                        urlCounter++;
+                    if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
                         plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash);
+                        urlCounter++;
                         /*plasmaCrawlLURL.Entry newEntry =*/ minimizedUrlDB.newEntry(urlEntry);
                         if (urlCounter % 500 == 0) {
                             log.logInfo(urlCounter + " URLs found so far.");
                         }
-                    }
+                    } catch (IOException e) {}
                 }
                 // we have read all elements, now we can close it
                 wordIdxEntity.close(); wordIdxEntity = null;
