*) Fixing Problems with MalformedURLs during Word Selection

- removing the (lurl.toString() == null) comparison because toString() never returns null
- adding a (lurl.url() == null) condition because url() returns null when the selected word entry
  has a malformed URL

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1083 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 0610ff4fe9
commit e6bf9d90a5

@ -327,21 +327,18 @@ public final class plasmaWordIndexDistribution {
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
if ((lurl == null) || (lurl.toString() == null)) {
if ((lurl == null) || (lurl.url() == null)) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
if (lurl.toString() == null) {
this.urlPool.loadedURL.remove(indexEntry.getUrlHash());
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
}
knownURLs.put(indexEntry.getUrlHash(), lurl);
}
}
// now delete all entries that have no url entry
hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
indexEntity.removeEntry((String) hashIter.next(), false);
String nextUrlHash = (String) hashIter.next();
indexEntity.removeEntry(nextUrlHash, false);
this.urlPool.loadedURL.remove(nextUrlHash);
}
if (indexEntity.size() == 0) {
@ -366,23 +363,20 @@ public final class plasmaWordIndexDistribution {
while ((urlIter.hasNext()) && (count > 0)) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
if (lurl == null) {
if ((lurl == null) || (lurl.url()==null)) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
if (lurl.toString() == null) {
this.urlPool.loadedURL.remove(indexEntry.getUrlHash());
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
tmpEntity.addEntry(indexEntry);
count--;
}
knownURLs.put(indexEntry.getUrlHash(), lurl);
tmpEntity.addEntry(indexEntry);
count--;
}
}
// now delete all entries that have no url entry
hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
indexEntity.removeEntry((String) hashIter.next(), true);
String nextUrlHash = (String) hashIter.next();
indexEntity.removeEntry(nextUrlHash, true);
this.urlPool.loadedURL.remove(nextUrlHash);
}
// use whats remaining
this.log.logFine("Selected partial index (" + tmpEntity.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + tmpEntity.wordHash());

Loading…
Cancel
Save