From 45005067351e7ceba20b748757149b48aee9351d Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 15 Dec 2005 10:31:00 +0000 Subject: [PATCH] fixed some bugs concerning url entry retrieval and IndexControl interface git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1212 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 2 +- htroot/IndexControl_p.java | 51 +++++++++---------- source/de/anomic/plasma/plasmaCrawlLURL.java | 1 + source/de/anomic/plasma/plasmaDbImporter.java | 8 +-- .../de/anomic/plasma/plasmaSearchEvent.java | 28 +++++----- .../de/anomic/plasma/plasmaSwitchboard.java | 42 ++++++++------- .../anomic/plasma/plasmaSwitchboardQueue.java | 9 +++- source/de/anomic/plasma/plasmaURL.java | 3 +- source/de/anomic/plasma/plasmaURLPool.java | 6 ++- .../plasma/plasmaWordIndexDistribution.java | 30 +++++++---- source/de/anomic/server/serverDate.java | 1 + source/yacy.java | 13 +++-- 12 files changed, 109 insertions(+), 85 deletions(-) diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index e1af6c413..cbaf95ac4 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -97,7 +97,7 @@ public class CacheAdmin_p { // generate sorted dir/file listing final String[] list = dir.list(); - final StringBuffer tree = new StringBuffer((list.length + 2) * 256); + final StringBuffer tree = new StringBuffer((list == null) ? 70 : (list.length + 2) * 256); tree.append("Directory of
").append((pathString.length() == 0) ? "domain list" : linkPathString(pathString)).append("

"); if (list == null) { tree.append("[empty]"); diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 2fb4389f4..79126b664 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -47,6 +47,7 @@ // if the shell's current path is HTROOT import java.io.IOException; +import java.net.MalformedURLException; import java.net.URL; import java.util.Enumeration; import java.util.HashSet; @@ -91,10 +92,10 @@ public class IndexControl_p { } // default values - String keystring = ((String) post.get("keystring")).trim(); - String keyhash = ((String) post.get("keyhash")).trim(); - String urlstring = ((String) post.get("urlstring")).trim(); - String urlhash = ((String) post.get("urlhash")).trim(); + String keystring = ((String) post.get("keystring", "")).trim(); + String keyhash = ((String) post.get("keyhash", "")).trim(); + String urlstring = ((String) post.get("urlstring", "")).trim(); + String urlhash = ((String) post.get("urlhash", "")).trim(); if (!urlstring.startsWith("http://") && !urlstring.startsWith("https://")) { urlstring = "http://" + urlstring; } @@ -166,9 +167,7 @@ public class IndexControl_p { } } if (delurlref) { - for (int i = 0; i < urlx.length; i++) try { - switchboard.removeAllUrlReferences(urlx[i], true); - } catch (IOException e) {} + for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true); } if (delurl || delurlref) { for (int i = 0; i < urlx.length; i++) { @@ -188,9 +187,7 @@ public class IndexControl_p { // delete selected URLs if (post.containsKey("keyhashdelete")) { if (delurlref) { - for (int i = 0; i < urlx.length; i++) try { - switchboard.removeAllUrlReferences(urlx[i], true); - } catch (IOException e) {} + for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true); } if (delurl || delurlref) { for (int i = 0; i < urlx.length; i++) { @@ -211,12 +208,12 @@ public class IndexControl_p { } if (post.containsKey("urlhashdeleteall")) { - try { + //try { int 
i = switchboard.removeAllUrlReferences(urlhash, true); prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes."); - } catch (IOException e) { - prop.put("result", "Deleted nothing because the url-hash could not be resolved"); - } + //} catch (IOException e) { + // prop.put("result", "Deleted nothing because the url-hash could not be resolved"); + //} } if (post.containsKey("urlhashdelete")) { @@ -311,11 +308,7 @@ public class IndexControl_p { while (hashIt.hasNext() && i < 256) { hash = (String) hashIt.next(); result.append("").append(hash).append(" ") .append(((i + 1) % 8 == 0) ? "
" : ""); i++; @@ -326,12 +319,15 @@ public class IndexControl_p { if (post.containsKey("urlstringsearch")) { try { URL url = new URL(urlstring); - urlhash = plasmaURL.urlHash(url); - prop.put("urlhash", urlhash); + urlhash = plasmaURL.urlHash(url); + prop.put("urlhash", urlhash); plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); prop.put("result", genUrlProfile(switchboard, entry, urlhash)); - } catch (Exception e) { - prop.put("urlstring", "wrong url: " + urlstring); + } catch (MalformedURLException e) { + prop.put("urlstring", "bad url: " + urlstring); + prop.put("urlhash", ""); + } catch (IOException e) { + prop.put("urlstring", "unknown url: " + urlstring); prop.put("urlhash", ""); } } @@ -356,7 +352,10 @@ public class IndexControl_p { int i = 0; while (hashIt.hasNext() && i < 256) { hash = (String) hashIt.next(); - result.append("").append(hash).append(" ").append(((i + 1) % 8 == 0) ? "
" : ""); + result.append("").append(hash).append(" ") + .append(((i + 1) % 8 == 0) ? "
" : ""); i++; } prop.put("result", result.toString()); @@ -449,10 +448,10 @@ public class IndexControl_p { final TreeMap tm = new TreeMap(); while (en.hasNext()) { uh = ((plasmaWordIndexEntry)en.next()).getUrlHash(); - if (switchboard.urlPool.loadedURL.exists(uh)) { + try { us = switchboard.urlPool.loadedURL.getEntry(uh).url().toString(); tm.put(us, uh); - } else { + } catch (IOException e) { tm.put("", uh); } } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index b434ec579..d2a13a6bd 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -468,6 +468,7 @@ public final class plasmaCrawlLURL extends plasmaURL { // if the url cannot be found, this returns null this.urlHash = urlHash; byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes()); + if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL"); try { if (entry != null) { this.url = new URL(new String(entry[1]).trim()); diff --git a/source/de/anomic/plasma/plasmaDbImporter.java b/source/de/anomic/plasma/plasmaDbImporter.java index 9545906cb..e6170b00a 100644 --- a/source/de/anomic/plasma/plasmaDbImporter.java +++ b/source/de/anomic/plasma/plasmaDbImporter.java @@ -1,6 +1,7 @@ package de.anomic.plasma; import java.io.File; +import java.io.IOException; import java.util.Iterator; import java.util.Vector; @@ -247,17 +248,16 @@ public class plasmaDbImporter extends Thread { entryCounter++; importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next(); String urlHash = importWordIdxEntry.getUrlHash(); - if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) { - urlCounter++; - + if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) try { // importing the new url plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash); + urlCounter++; this.homeUrlDB.newEntry(urlEntry); if (urlCounter % 500 == 0) 
{ this.log.logFine(urlCounter + " URLs processed so far."); } - } + } catch (IOException e) {} // adding word index entity to container newContainer.add(importWordIdxEntry,System.currentTimeMillis()); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index a01130e21..92f1b5b6d 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -217,27 +217,31 @@ public final class plasmaSearchEvent { profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_PRESORT, rcLocal.size()); profileLocal.startTimer(); - plasmaSearchResult acc = new plasmaSearchResult(query); - if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty + plasmaSearchResult acc = new plasmaSearchResult(query); + if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty if (searchResult.size() == 0) return acc; // case that we have nothing to do // start url-fetch - plasmaWordIndexEntry entry; + plasmaWordIndexEntry entry; long postorderLimitTime = (postorderTime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + postorderTime; plasmaCrawlLURL.Entry page; int minEntries = profileLocal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT); - try { - while (preorder.hasNext()) { + try { + while (preorder.hasNext()) { if ((acc.sizeFetched() >= minEntries) && (System.currentTimeMillis() >= postorderLimitTime)) break; entry = preorder.next(); // find the url entry - page = urlStore.getEntry(entry.getUrlHash()); - // add a result - acc.addResult(entry, page); - } - } catch (kelondroException ee) { - serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee); - } + try { + page = urlStore.getEntry(entry.getUrlHash()); + // add a result + acc.addResult(entry, page); + } catch (IOException e) { + // result was not found + } + } + } catch (kelondroException ee) { + serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee); + } profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_URLFETCH); profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_URLFETCH, acc.sizeFetched()); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 2b038ab45..2d117e851 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1221,12 +1221,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // create index String descr = document.getMainLongTitle(); String referrerHash; - try { - URL referrerURL = entry.referrerURL(); - referrerHash = plasmaURL.urlHash(referrerURL); - } catch (IOException e) { - referrerHash = plasmaURL.dummyHash; - } + URL referrerURL = entry.referrerURL(); + referrerHash = plasmaURL.urlHash(referrerURL); + if (referrerHash == null) referrerHash = plasmaURL.dummyHash; + String noIndexReason = "unspecified"; if (processCase == 4) { // proxy-load @@ -1825,26 +1823,32 @@ public final class plasmaSwitchboard 
extends serverAbstractSwitch implements ser } // method for index deletion - public int removeAllUrlReferences(URL url, boolean fetchOnline) throws IOException { + public int removeAllUrlReferences(URL url, boolean fetchOnline) { return removeAllUrlReferences(plasmaURL.urlHash(url), fetchOnline); } - public int removeAllUrlReferences(String urlhash, boolean fetchOnline) throws IOException { + public int removeAllUrlReferences(String urlhash, boolean fetchOnline) { // find all the words in a specific resource and remove the url reference from every word index // finally, delete the url entry // determine the url string - plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash); - URL url = entry.url(); - if (url == null) return 0; - // get set of words - //Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); - Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText()); - // delete all word references - int count = removeReferences(urlhash, words); - // finally delete the url entry itself - urlPool.loadedURL.remove(urlhash); - return count; + try { + plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash); + URL url = entry.url(); + if (url == null) + return 0; + // get set of words + // Set words = plasmaCondenser.getWords(getText(getResource(url, + // fetchOnline))); + Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText()); + // delete all word references + int count = removeReferences(urlhash, words); + // finally delete the url entry itself + urlPool.loadedURL.remove(urlhash); + return count; + } catch (IOException e) { + return 0; + } } public int removeReferences(URL url, Set words) { diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index aaf5ca73f..84a9746e1 100644 --- 
a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -286,10 +286,15 @@ public class plasmaSwitchboardQueue { return responseHeader; } - public URL referrerURL() throws IOException { + public URL referrerURL() { if (referrerURL == null) { if ((referrerHash == null) || (referrerHash.equals(plasmaURL.dummyHash))) return null; - referrerURL = lurls.getEntry(referrerHash).url(); + try { + referrerURL = lurls.getEntry(referrerHash).url(); + } catch (IOException e) { + referrerURL = null; + return null; + } } return referrerURL; } diff --git a/source/de/anomic/plasma/plasmaURL.java b/source/de/anomic/plasma/plasmaURL.java index 0c8fdaf85..fe4d9bec3 100644 --- a/source/de/anomic/plasma/plasmaURL.java +++ b/source/de/anomic/plasma/plasmaURL.java @@ -473,6 +473,7 @@ public class plasmaURL { } public static final String urlHash(String url) { + if ((url == null) || (url.length() == 0)) return null; try { return urlHash(new URL(url)); } catch (MalformedURLException e) { @@ -542,7 +543,7 @@ public class plasmaURL { } public Iterator urlHashes(String urlHash, boolean up) { - return urlHashCache.rows(up, false, urlHash.getBytes()); + return urlHashCache.keys(up, false, urlHash.getBytes()); } } diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java index ed3d96465..dbd5e9a31 100644 --- a/source/de/anomic/plasma/plasmaURLPool.java +++ b/source/de/anomic/plasma/plasmaURLPool.java @@ -75,8 +75,10 @@ public class plasmaURLPool { if (urlhash.equals(plasmaURL.dummyHash)) return null; plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); if (ne != null) return ne.url(); - plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash); - if (le != null) return le.url(); + try { + plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash); + if (le != null) return le.url(); + } catch (IOException e) {} plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); if (ee != null) return ee.url(); 
return null; diff --git a/source/de/anomic/plasma/plasmaWordIndexDistribution.java b/source/de/anomic/plasma/plasmaWordIndexDistribution.java index b1ff11f12..f32409ebd 100644 --- a/source/de/anomic/plasma/plasmaWordIndexDistribution.java +++ b/source/de/anomic/plasma/plasmaWordIndexDistribution.java @@ -330,12 +330,16 @@ public final class plasmaWordIndexDistribution { urlIter = indexEntity.elements(true); unknownURLEntries.clear(); while (urlIter.hasNext()) { - indexEntry = (plasmaWordIndexEntry) urlIter.next(); - lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); - if ((lurl == null) || (lurl.url() == null)) { + indexEntry = (plasmaWordIndexEntry) urlIter.next(); + try { + lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); + if ((lurl == null) || (lurl.url() == null)) { + unknownURLEntries.add(indexEntry.getUrlHash()); + } else { + knownURLs.put(indexEntry.getUrlHash(), lurl); + } + } catch (IOException e) { unknownURLEntries.add(indexEntry.getUrlHash()); - } else { - knownURLs.put(indexEntry.getUrlHash(), lurl); } } // now delete all entries that have no url entry @@ -367,13 +371,17 @@ public final class plasmaWordIndexDistribution { unknownURLEntries.clear(); while ((urlIter.hasNext()) && (count > 0)) { indexEntry = (plasmaWordIndexEntry) urlIter.next(); - lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); - if ((lurl == null) || (lurl.url()==null)) { + try { + lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); + if ((lurl == null) || (lurl.url()==null)) { + unknownURLEntries.add(indexEntry.getUrlHash()); + } else { + knownURLs.put(indexEntry.getUrlHash(), lurl); + tmpEntity.addEntry(indexEntry); + count--; + } + } catch (IOException e) { unknownURLEntries.add(indexEntry.getUrlHash()); - } else { - knownURLs.put(indexEntry.getUrlHash(), lurl); - tmpEntity.addEntry(indexEntry); - count--; } } // now delete all entries that have no url entry diff --git a/source/de/anomic/server/serverDate.java 
b/source/de/anomic/server/serverDate.java index db2ccb70f..5fc05866d 100644 --- a/source/de/anomic/server/serverDate.java +++ b/source/de/anomic/server/serverDate.java @@ -166,6 +166,7 @@ public final class serverDate { if (utime < utimeyearsacc[years]) years--; // the correction long remain = utime - utimeyearsacc[years]; months = (int) (remain / (29 * dayMillis)); // a guess + if (months > 11) months = 11; if ((years & 3) == 0) { if (remain < dimleapacc[months]) months--; // correction remain = remain - dimleapacc[months]; diff --git a/source/yacy.java b/source/yacy.java index 62147f23f..e4cc57b79 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -797,17 +797,16 @@ public final class yacy { entryCounter++; importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next(); String urlHash = importWordIdxEntry.getUrlHash(); - if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) { - urlCounter++; - + if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try { // importing the new url plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash); + urlCounter++; homeUrlDB.newEntry(urlEntry); if (urlCounter % 500 == 0) { log.logFine(urlCounter + " URLs processed so far."); } - } + } catch (IOException e) {} // adding word index entity to container newContainer.add(importWordIdxEntry,System.currentTimeMillis()); @@ -906,14 +905,14 @@ public final class yacy { while (wordIdxEntries.hasNext()) { wordIdxEntry = (plasmaWordIndexEntry) wordIdxEntries.next(); String urlHash = wordIdxEntry.getUrlHash(); - if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) { - urlCounter++; + if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash); + urlCounter++; /*plasmaCrawlLURL.Entry newEntry =*/ minimizedUrlDB.newEntry(urlEntry); if (urlCounter % 500 == 0) { log.logInfo(urlCounter + " URLs found so far."); } - } + } catch (IOException e) {} } 
// we have read all elements, now we can close it wordIdxEntity.close(); wordIdxEntity = null;