enhancements, bugfixes and additions to word index attribute storage

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1392 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 0f750c2ed6
commit f14d49fae9

@ -53,7 +53,8 @@ import java.io.IOException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Set;
import java.util.Iterator;
import java.util.Map;
import java.net.InetAddress;
import java.net.UnknownHostException;
import de.anomic.http.httpHeader;
@ -478,8 +479,12 @@ public class dir {
public static void deletePhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr) {
try {
final String urlhash = plasmaURL.urlHash(new URL(urlstring));
final Set words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8"));
switchboard.removeReferences(urlhash, words);
final Iterator words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8"));
Map.Entry entry;
while (words.hasNext()) {
entry = (Map.Entry) words.next();
switchboard.wordIndex.removeEntries(plasmaWordIndexEntry.word2hash((String) entry.getKey()), new String[] {urlhash}, true);
}
switchboard.urlPool.loadedURL.remove(urlhash);
} catch (Exception e) {
serverLog.logSevere("DIR", "INTERNAL ERROR in dir.deletePhrase", e);

@ -57,7 +57,6 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
@ -71,7 +70,7 @@ public final class plasmaCondenser {
private final static int numlength = 5;
//private Properties analysis;
private TreeMap words; // a string (the words) to (statProp) - relation
private TreeMap words; // a string (the words) to (wordStatProp) - relation
private HashMap sentences;
private int wordminsize;
private int wordcut;
@ -109,10 +108,13 @@ public final class plasmaCondenser {
return oldsize - words.size();
}
public Set getWords() {
return words.keySet();
public Iterator words() {
    // hands out an iterator over the entry set of the word map;
    // every element is a Map.Entry: key = the word (a String), value = its wordStatProp object
    final Iterator entryIterator = this.words.entrySet().iterator();
    return entryIterator;
}
/*
public int wordCount(String word) {
// number of occurrences of one word
// if the word did not occur, this simply returns 0
@ -144,7 +146,8 @@ public final class plasmaCondenser {
if (sp == null) return 0;
return sp.numOfPhrase;
}
*/
public static class wordStatProp {
// object carries statistics for words and sentences
@ -706,11 +709,11 @@ public final class plasmaCondenser {
}
*/
public static Set getWords(byte[] text) {
public static Iterator getWords(byte[] text) {
if (text == null) return null;
ByteArrayInputStream buffer = new ByteArrayInputStream(text);
plasmaCondenser condenser = new plasmaCondenser(buffer);
return condenser.getWords();
return condenser.words();
}
public static void main(String[] args) {

@ -1313,9 +1313,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else {
HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry);
ArrayList tmpEntities = new ArrayList(condenser.getWords().size());
ArrayList tmpEntities = new ArrayList(condenser.RESULT_SIMI_WORDS);
String language = plasmaWordIndexEntry.language(entry.url());
char doctype = plasmaWordIndexEntry.docType(document.getMimeType());
int quality = 0;
try {
quality = condenser.RESULT_INFORMATION_VALUE;
@ -1323,22 +1323,23 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + newEntry.url().toString());
}
String language = plasmaWordIndexEntry.language(entry.url());
char doctype = plasmaWordIndexEntry.docType(document.getMimeType());
// iterate over all words
Iterator i = condenser.getWords().iterator();
Iterator i = condenser.words();
Map.Entry wentry;
plasmaCondenser.wordStatProp wordStat;
while (i.hasNext()) {
String word = (String) i.next();
wentry = (Map.Entry) i.next();
String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = plasmaWordIndexEntry.word2hash(word);
plasmaWordIndexEntity wordIdxEntity = new plasmaWordIndexEntity(wordHash);
plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
condenser.wordCount(word),
wordStat.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
condenser.wordPositionInText(word),
condenser.wordPositionInPhrase(word),
condenser.wordNumberOfPhrase(word),
wordStat.posInText,
wordStat.posInPhrase,
wordStat.numOfPhrase,
0,
docDate.getTime(),
quality, language, doctype, true);
@ -1347,7 +1348,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));
}
//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
words = condenser.getWords().size();
words = condenser.RESULT_SIMI_WORDS;
// transferring the index to the storage peer
String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000);
@ -1805,9 +1806,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url,
// fetchOnline)));
Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
// delete all word references
int count = removeReferences(urlhash, words);
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself
urlPool.loadedURL.remove(urlhash);
return count;
@ -1834,7 +1835,23 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
return count;
}
public int removeReferences(final String urlhash, final Iterator wordStatPropIterator) {
    // sequentially deletes the reference to the given url hash from the index of every word
    // delivered by the iterator; each element must be a Map.Entry whose key is the word (a String)
    // returns the number of deletions, summed over all words

    // plasmaCondenser.getWords(byte[]) returns null when it is given a null text;
    // in that case there is nothing to delete, so report zero deletions instead of failing
    if (wordStatPropIterator == null) return 0;

    // the same single-element url list is reused for every word
    final String[] urlEntries = new String[] {urlhash};
    int count = 0;
    while (wordStatPropIterator.hasNext()) {
        final Map.Entry entry = (Map.Entry) wordStatPropIterator.next();
        final String word = (String) entry.getKey();
        // delete the URL reference in this word index
        count += wordIndex.removeEntries(plasmaWordIndexEntry.word2hash(word), urlEntries, true);
    }
    return count;
}
public int adminAuthenticated(httpHeader header) {
String adminAccountBase64MD5 = getConfig("adminAccountBase64MD5", "");
if (adminAccountBase64MD5.length() == 0) return 2; // not necessary

@ -51,6 +51,7 @@ import java.io.File;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import java.util.HashSet;
import java.util.Set;
@ -150,28 +151,33 @@ public final class plasmaWordIndex {
}
// iterate over all words
Iterator i = condenser.getWords().iterator();
Iterator i = condenser.words();
Map.Entry wentry;
String word;
plasmaWordIndexEntry entry;
plasmaWordIndexEntry ientry;
plasmaCondenser.wordStatProp wprop;
String wordHash;
while (i.hasNext()) {
word = (String) i.next();
wentry = (Map.Entry) i.next();
word = (String) wentry.getKey();
wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaWordIndexEntry.word2hash(word);
entry = new plasmaWordIndexEntry(urlHash,
condenser.wordCount(word),
ientry = new plasmaWordIndexEntry(urlHash,
wprop.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
condenser.wordPositionInText(word),
condenser.wordPositionInPhrase(word),
condenser.wordNumberOfPhrase(word),
wprop.posInText,
wprop.posInPhrase,
wprop.numOfPhrase,
0,
urlModified.getTime(), quality, language, doctype, true);
addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), false);
urlModified.getTime(),
quality, language, doctype, true);
addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false);
}
// System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
// condenser.getWords().size() + " words, flushed " + c + " entries");
return condenser.getWords().size();
return condenser.RESULT_SIMI_WORDS;
}
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {

@ -209,7 +209,7 @@ public final class plasmaWordIndexEntry {
int posinphrase, // position of word in its phrase
int posofphrase, // number of the phrase where word appears
int distance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
long time, // last-modified time of the document where word appears
long lastmodified, // last-modified time of the document where word appears
int quality, //
String language, //
char doctype, //
@ -232,7 +232,7 @@ public final class plasmaWordIndexEntry {
this.posinphrase = posinphrase;
this.posofphrase = posofphrase;
this.worddistance = distance;
this.lastModified = time;
this.lastModified = lastmodified;
this.quality = quality;
this.language = language.getBytes();
this.doctype = doctype;
@ -325,6 +325,10 @@ public final class plasmaWordIndexEntry {
// Folds the positional attributes of another entry (oe) into this entry.
// Only fields of this object are assigned; oe's fields are only read.
public void combineDistance(plasmaWordIndexEntry oe) {
// new distance: both previous distances plus the absolute gap between the two text positions;
// this has to happen before posintext is overwritten on the next line
this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext);
// keep the smaller of the two text positions
this.posintext = Math.min(this.posintext, oe.posintext);
// if the two entries carry different phrase numbers, the in-phrase position is reset to 0;
// the comparison uses the phrase number from before the update on the following line
if (this.posofphrase != oe.posofphrase) this.posinphrase = 0; // (unknown)
// keep the smaller of the two phrase numbers
this.posofphrase = Math.min(this.posofphrase, oe.posofphrase);
// mean of both word counts (truncating if wordcount is an integer type -- TODO confirm field type)
this.wordcount = (this.wordcount + oe.wordcount) / 2;
}
public String getUrlHash() { return urlHash; }

@ -457,7 +457,8 @@ public final class yacyClient {
urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
final plasmaWordIndexEntry entry;
if (urlEntry.word() == null)
if (urlEntry.word() == null) {
// the old way to define words
entry = new plasmaWordIndexEntry(
urlEntry.hash(),
urlEntry.wordCount(),
@ -468,7 +469,10 @@ public final class yacyClient {
urlEntry.doctype(),
false
);
else entry = urlEntry.word();
} else {
// the new way: the search-result-url transports all the attributes of word indexes
entry = urlEntry.word();
}
if (urlEntry.snippet() != null) {
// we don't store the snippets along the url entry, because they are search-specific.
// instead, they are placed in a snippet-search cache.

Loading…
Cancel
Save