fixed some bugs concerning url entry retrieval and the IndexControl interface

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1212 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter
parent 83a34b838d
commit 4500506735
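Most of the hunks below follow one pattern: plasmaCrawlLURL.Entry lookups now throw an IOException when a url-hash cannot be resolved, instead of silently constructing an entry with null fields, and every caller is converted to try/catch (or, like removeAllUrlReferences, absorbs the failure itself). A minimal, self-contained sketch of that contract change; UrlStore and all names in it are illustrative stand-ins, not YaCy code:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

// Hypothetical stand-in for plasmaCrawlLURL: getEntry() signals a missing
// hash with an IOException instead of returning an entry with null fields.
class UrlStore {
    private final Map entries = new HashMap();
    void put(String hash, String url) { entries.put(hash, url); }
    String getEntry(String urlHash) throws IOException {
        Object url = entries.get(urlHash);
        if (url == null) throw new IOException("url hash " + urlHash + " not found in LURL");
        return (String) url;
    }
}

public class GetEntryDemo {
    public static void main(String[] args) throws IOException {
        UrlStore store = new UrlStore();
        store.put("abc123", "http://example.org/");
        System.out.println(store.getEntry("abc123")); // http://example.org/
        try {
            store.getEntry("missing");
        } catch (IOException e) {
            // the caller pattern this commit introduces everywhere:
            // treat a failed lookup as "url unknown" and continue
            System.out.println("unknown url-hash");
        }
    }
}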

@@ -97,7 +97,7 @@ public class CacheAdmin_p {
         // generate sorted dir/file listing
         final String[] list = dir.list();
-        final StringBuffer tree = new StringBuffer((list.length + 2) * 256);
+        final StringBuffer tree = new StringBuffer((list == null) ? 70 : (list.length + 2) * 256);
         tree.append("Directory of<br>").append((pathString.length() == 0) ? "domain list" : linkPathString(pathString)).append("<br><br>");
         if (list == null) {
             tree.append("[empty]");

@@ -47,6 +47,7 @@
 // if the shell's current path is HTROOT
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.Enumeration;
 import java.util.HashSet;
@@ -91,10 +92,10 @@ public class IndexControl_p {
         }
         // default values
-        String keystring = ((String) post.get("keystring")).trim();
-        String keyhash = ((String) post.get("keyhash")).trim();
-        String urlstring = ((String) post.get("urlstring")).trim();
-        String urlhash = ((String) post.get("urlhash")).trim();
+        String keystring = ((String) post.get("keystring", "")).trim();
+        String keyhash = ((String) post.get("keyhash", "")).trim();
+        String urlstring = ((String) post.get("urlstring", "")).trim();
+        String urlhash = ((String) post.get("urlhash", "")).trim();
         if (!urlstring.startsWith("http://") &&
             !urlstring.startsWith("https://")) { urlstring = "http://" + urlstring; }
@@ -166,9 +167,7 @@ public class IndexControl_p {
             }
         }
         if (delurlref) {
-            for (int i = 0; i < urlx.length; i++) try {
-                switchboard.removeAllUrlReferences(urlx[i], true);
-            } catch (IOException e) {}
+            for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
         }
         if (delurl || delurlref) {
             for (int i = 0; i < urlx.length; i++) {
@@ -188,9 +187,7 @@ public class IndexControl_p {
         // delete selected URLs
         if (post.containsKey("keyhashdelete")) {
             if (delurlref) {
-                for (int i = 0; i < urlx.length; i++) try {
-                    switchboard.removeAllUrlReferences(urlx[i], true);
-                } catch (IOException e) {}
+                for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
             }
             if (delurl || delurlref) {
                 for (int i = 0; i < urlx.length; i++) {
@@ -211,12 +208,12 @@ public class IndexControl_p {
         }
         if (post.containsKey("urlhashdeleteall")) {
-            try {
+            //try {
                 int i = switchboard.removeAllUrlReferences(urlhash, true);
                 prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
-            } catch (IOException e) {
-                prop.put("result", "Deleted nothing because the url-hash could not be resolved");
-            }
+            //} catch (IOException e) {
+            //    prop.put("result", "Deleted nothing because the url-hash could not be resolved");
+            //}
         }
         if (post.containsKey("urlhashdelete")) {
@@ -311,11 +308,7 @@ public class IndexControl_p {
             while (hashIt.hasNext() && i < 256) {
                 hash = (String) hashIt.next();
                 result.append("<a href=\"/IndexControl_p.html?")
-                      .append("keystring=")
-                      .append("&keyhash=").append(hash)
-                      .append("&urlhash=")
-                      .append("&urlstring=")
-                      .append("&urlhashsearch=")
+                      .append("keyhash=").append(hash).append("&keyhashsearch=")
                       .append("\" class=\"tt\">").append(hash).append("</a> ")
                       .append(((i + 1) % 8 == 0) ? "<br>" : "");
                 i++;
@@ -330,8 +323,11 @@ public class IndexControl_p {
                 prop.put("urlhash", urlhash);
                 plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
                 prop.put("result", genUrlProfile(switchboard, entry, urlhash));
-            } catch (Exception e) {
-                prop.put("urlstring", "wrong url: " + urlstring);
+            } catch (MalformedURLException e) {
+                prop.put("urlstring", "bad url: " + urlstring);
                 prop.put("urlhash", "");
+            } catch (IOException e) {
+                prop.put("urlstring", "unknown url: " + urlstring);
+                prop.put("urlhash", "");
             }
         }
@@ -356,7 +352,10 @@ public class IndexControl_p {
             int i = 0;
             while (hashIt.hasNext() && i < 256) {
                 hash = (String) hashIt.next();
-                result.append("<a href=\"/IndexControl_p.html?").append("keystring=").append("&keyhash=").append("&urlhash=").append(hash).append("&urlstring=").append("&urlhashsearch=").append("\" class=\"tt\">").append(hash).append("</a> ").append(((i + 1) % 8 == 0) ? "<br>" : "");
+                result.append("<a href=\"/IndexControl_p.html?")
+                      .append("urlhash=").append(hash).append("&urlhashsearch=")
+                      .append("\" class=\"tt\">").append(hash).append("</a> ")
+                      .append(((i + 1) % 8 == 0) ? "<br>" : "");
                 i++;
             }
             prop.put("result", result.toString());
@@ -449,10 +448,10 @@ public class IndexControl_p {
         final TreeMap tm = new TreeMap();
         while (en.hasNext()) {
             uh = ((plasmaWordIndexEntry)en.next()).getUrlHash();
-            if (switchboard.urlPool.loadedURL.exists(uh)) {
+            try {
                 us = switchboard.urlPool.loadedURL.getEntry(uh).url().toString();
                 tm.put(us, uh);
-            } else {
+            } catch (IOException e) {
                 tm.put("", uh);
             }
         }

@@ -468,6 +468,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
             // if the url cannot be found, this returns null
             this.urlHash = urlHash;
             byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
+            if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
             try {
                 if (entry != null) {
                     this.url = new URL(new String(entry[1]).trim());

@@ -1,6 +1,7 @@
 package de.anomic.plasma;

 import java.io.File;
+import java.io.IOException;
 import java.util.Iterator;
 import java.util.Vector;
@@ -247,17 +248,16 @@ public class plasmaDbImporter extends Thread {
                 entryCounter++;
                 importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
                 String urlHash = importWordIdxEntry.getUrlHash();
-                if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) {
-                    urlCounter++;
+                if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) try {
                     // importing the new url
                     plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash);
+                    urlCounter++;
                     this.homeUrlDB.newEntry(urlEntry);
                     if (urlCounter % 500 == 0) {
                         this.log.logFine(urlCounter + " URLs processed so far.");
                     }
-                }
+                } catch (IOException e) {}
                 // adding word index entity to container
                 newContainer.add(importWordIdxEntry,System.currentTimeMillis());

@@ -231,9 +231,13 @@ public final class plasmaSearchEvent {
                 if ((acc.sizeFetched() >= minEntries) && (System.currentTimeMillis() >= postorderLimitTime)) break;
                 entry = preorder.next();
                 // find the url entry
+                try {
                     page = urlStore.getEntry(entry.getUrlHash());
                     // add a result
                     acc.addResult(entry, page);
+                } catch (IOException e) {
+                    // result was not found
+                }
             }
         } catch (kelondroException ee) {
             serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);

@@ -1221,12 +1221,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         // create index
         String descr = document.getMainLongTitle();
         String referrerHash;
-        try {
             URL referrerURL = entry.referrerURL();
             referrerHash = plasmaURL.urlHash(referrerURL);
-        } catch (IOException e) {
-            referrerHash = plasmaURL.dummyHash;
-        }
+        if (referrerHash == null) referrerHash = plasmaURL.dummyHash;
         String noIndexReason = "unspecified";
         if (processCase == 4) {
             // proxy-load
@@ -1825,26 +1823,32 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     }

     // method for index deletion
-    public int removeAllUrlReferences(URL url, boolean fetchOnline) throws IOException {
+    public int removeAllUrlReferences(URL url, boolean fetchOnline) {
         return removeAllUrlReferences(plasmaURL.urlHash(url), fetchOnline);
     }

-    public int removeAllUrlReferences(String urlhash, boolean fetchOnline) throws IOException {
+    public int removeAllUrlReferences(String urlhash, boolean fetchOnline) {
         // find all the words in a specific resource and remove the url reference from every word index
         // finally, delete the url entry
         // determine the url string
+        try {
             plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash);
             URL url = entry.url();
-            if (url == null) return 0;
+            if (url == null)
+                return 0;
             // get set of words
-            //Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
+            // Set words = plasmaCondenser.getWords(getText(getResource(url,
+            // fetchOnline)));
             Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
             // delete all word references
             int count = removeReferences(urlhash, words);
             // finally delete the url entry itself
             urlPool.loadedURL.remove(urlhash);
             return count;
+        } catch (IOException e) {
+            return 0;
+        }
     }

     public int removeReferences(URL url, Set words) {
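Since removeAllUrlReferences now absorbs the lookup failure itself and returns 0 instead of throwing, callers such as IndexControl_p can drop their try/catch wrappers, as the hunks above show. A hedged usage sketch, assuming a plasmaSwitchboard instance named switchboard and a serverObjects prop as in those hunks:

// no IOException handling needed any more; an unresolvable url-hash yields 0
int count = switchboard.removeAllUrlReferences(urlhash, true);
prop.put("result", "Deleted URL and " + count + " references from " + count + " word indexes.");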

@@ -286,10 +286,15 @@ public class plasmaSwitchboardQueue {
             return responseHeader;
         }

-        public URL referrerURL() throws IOException {
+        public URL referrerURL() {
             if (referrerURL == null) {
                 if ((referrerHash == null) || (referrerHash.equals(plasmaURL.dummyHash))) return null;
+                try {
                     referrerURL = lurls.getEntry(referrerHash).url();
+                } catch (IOException e) {
+                    referrerURL = null;
+                    return null;
+                }
             }
             return referrerURL;
         }

@@ -473,6 +473,7 @@ public class plasmaURL {
     }

     public static final String urlHash(String url) {
+        if ((url == null) || (url.length() == 0)) return null;
         try {
             return urlHash(new URL(url));
         } catch (MalformedURLException e) {
@@ -542,7 +543,7 @@ public class plasmaURL {
     }

     public Iterator urlHashes(String urlHash, boolean up) {
-        return urlHashCache.rows(up, false, urlHash.getBytes());
+        return urlHashCache.keys(up, false, urlHash.getBytes());
     }
 }

@@ -75,8 +75,10 @@ public class plasmaURLPool {
         if (urlhash.equals(plasmaURL.dummyHash)) return null;
         plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
         if (ne != null) return ne.url();
+        try {
             plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash);
             if (le != null) return le.url();
+        } catch (IOException e) {}
         plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
         if (ee != null) return ee.url();
         return null;

@@ -331,12 +331,16 @@ public final class plasmaWordIndexDistribution {
                 unknownURLEntries.clear();
                 while (urlIter.hasNext()) {
                     indexEntry = (plasmaWordIndexEntry) urlIter.next();
+                    try {
                         lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
                         if ((lurl == null) || (lurl.url() == null)) {
                             unknownURLEntries.add(indexEntry.getUrlHash());
                         } else {
                             knownURLs.put(indexEntry.getUrlHash(), lurl);
                         }
+                    } catch (IOException e) {
+                        unknownURLEntries.add(indexEntry.getUrlHash());
+                    }
                 }
                 // now delete all entries that have no url entry
                 hashIter = unknownURLEntries.iterator();
@@ -367,6 +371,7 @@ public final class plasmaWordIndexDistribution {
                 unknownURLEntries.clear();
                 while ((urlIter.hasNext()) && (count > 0)) {
                     indexEntry = (plasmaWordIndexEntry) urlIter.next();
+                    try {
                         lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
                         if ((lurl == null) || (lurl.url()==null)) {
                             unknownURLEntries.add(indexEntry.getUrlHash());
@@ -375,6 +380,9 @@ public final class plasmaWordIndexDistribution {
                             tmpEntity.addEntry(indexEntry);
                             count--;
                         }
+                    } catch (IOException e) {
+                        unknownURLEntries.add(indexEntry.getUrlHash());
+                    }
                 }
                 // now delete all entries that have no url entry
                 hashIter = unknownURLEntries.iterator();

@@ -166,6 +166,7 @@ public final class serverDate {
         if (utime < utimeyearsacc[years]) years--; // the correction
         long remain = utime - utimeyearsacc[years];
         months = (int) (remain / (29 * dayMillis)); // a guess
+        if (months > 11) months = 11;
         if ((years & 3) == 0) {
             if (remain < dimleapacc[months]) months--; // correction
             remain = remain - dimleapacc[months];
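The guess divides the offset into the year by a 29-day month, so near the end of a 365- or 366-day year it can reach 12, one past the last valid month index; the added clamp keeps the dimleapacc lookup in bounds. A quick self-contained check of that arithmetic, assuming dayMillis is one day in milliseconds as the surrounding code suggests:

public class MonthGuessCheck {
    public static void main(String[] args) {
        long dayMillis = 24L * 60 * 60 * 1000;          // one day in milliseconds
        long remain = 364L * dayMillis;                 // Dec 31 of a non-leap year
        int months = (int) (remain / (29 * dayMillis)); // the "guess": 364 / 29 = 12
        System.out.println(months);                     // prints 12, past index 11
        if (months > 11) months = 11;                   // the clamp added above
        System.out.println(months);                     // prints 11
    }
}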

@@ -797,17 +797,16 @@ public final class yacy {
                 entryCounter++;
                 importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
                 String urlHash = importWordIdxEntry.getUrlHash();
-                if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) {
-                    urlCounter++;
+                if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try {
                     // importing the new url
                     plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash);
+                    urlCounter++;
                     homeUrlDB.newEntry(urlEntry);
                     if (urlCounter % 500 == 0) {
                         log.logFine(urlCounter + " URLs processed so far.");
                     }
-                }
+                } catch (IOException e) {}
                 // adding word index entity to container
                 newContainer.add(importWordIdxEntry,System.currentTimeMillis());
@@ -906,14 +905,14 @@ public final class yacy {
                 while (wordIdxEntries.hasNext()) {
                     wordIdxEntry = (plasmaWordIndexEntry) wordIdxEntries.next();
                     String urlHash = wordIdxEntry.getUrlHash();
-                    if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) {
-                        urlCounter++;
+                    if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
                         plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash);
+                        urlCounter++;
                         /*plasmaCrawlLURL.Entry newEntry =*/ minimizedUrlDB.newEntry(urlEntry);
                         if (urlCounter % 500 == 0) {
                             log.logInfo(urlCounter + " URLs found so far.");
                         }
-                    }
+                    } catch (IOException e) {}
                 }
                 // we have read all elements, now we can close it
                 wordIdxEntity.close(); wordIdxEntity = null;
