- implemented more attributes to index entries

- implemented hand-over of new word index attributes during remote search
- implemented word-distance computation during search

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1382 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 026dcdfcc0
commit f4ffa9aee5

@ -218,7 +218,7 @@ public class IndexControl_p {
if (post.containsKey("urlhashdelete")) {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null);
URL url = entry.url();
urlstring = htmlFilterContentScraper.urlNormalform(url);
prop.put("urlstring", "");
@ -268,7 +268,7 @@ public class IndexControl_p {
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
try {
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null);
if (lurl.toString() == null) {
switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash());
unknownURLEntries.add(indexEntry.getUrlHash());
@ -321,7 +321,7 @@ public class IndexControl_p {
URL url = new URL(urlstring);
urlhash = plasmaURL.urlHash(url);
prop.put("urlhash", urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null);
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
} catch (MalformedURLException e) {
prop.put("urlstring", "bad url: " + urlstring);
@ -334,7 +334,7 @@ public class IndexControl_p {
if (post.containsKey("urlhashsearch")) {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null);
URL url = entry.url();
urlstring = url.toString();
prop.put("urlstring", urlstring);
@ -395,7 +395,7 @@ public class IndexControl_p {
URL url = entry.url();
String referrer = null;
try {
referrer = switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url().toString();
referrer = switchboard.urlPool.loadedURL.getEntry(entry.referrerHash(), null).url().toString();
} catch (IOException e) {
referrer = "<unknown>";
}
@ -452,7 +452,7 @@ public class IndexControl_p {
xi = (plasmaWordIndexEntry) en.next();
uh = new String[]{xi.getUrlHash(), Integer.toString(xi.posintext())};
try {
us = switchboard.urlPool.loadedURL.getEntry(uh[0]).url().toString();
us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString();
tm.put(us, uh);
} catch (IOException e) {
tm.put(uh[0], uh);

@ -104,7 +104,7 @@ public class ViewFile {
// getting the urlEntry that belongs to the url hash
Entry urlEntry = null;
try {
urlEntry = sb.urlPool.loadedURL.getEntry(urlHash);
urlEntry = sb.urlPool.loadedURL.getEntry(urlHash, null);
} catch (IOException e) {
prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT);

@ -245,7 +245,7 @@ public final class crawlOrder {
reason = reasonString;
// send lurl-Entry as response
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaURL.urlHash(url));
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaURL.urlHash(url), null);
response = "double";
switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
lurl = crypt.simpleEncode(entry.toString());

@ -47,11 +47,17 @@
// javac -classpath .:../../Classes search.java
// if the shell's current path is htroot/yacy
import java.io.IOException;
import java.util.HashSet;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchResult;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
@ -64,9 +70,7 @@ public final class search {
// return variable that accumulates replacements
final plasmaSwitchboard sb = (plasmaSwitchboard) ss;
serverObjects prop = new serverObjects();
if (prop == null || sb == null) { return null; }
//System.out.println("yacy: search received request = " + post.toString());
final String oseed = post.get("myseed", ""); // complete seed of the requesting peer
@ -79,12 +83,20 @@ public final class search {
final int count = post.getInt("count", 10); // maximum number of wanted results
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
// tell all threads to do nothing for a specific time
sb.wordIndex.intermission(2 * duetime);
sb.intermissionAllThreads(2 * duetime);
// store accessing peer
if (yacyCore.seedDB == null) {
yacyCore.log.logSevere("yacy.search: seed cache not initialized");
} else {
yacyCore.peerActions.peerArrival(yacySeed.genRemoteSeed(oseed, key), true);
}
// prepare search
final HashSet keyhashes = new HashSet(query.length() / plasmaWordIndexEntry.wordHashLength);
for (int i = 0; i < (query.length() / plasmaWordIndexEntry.wordHashLength); i++) {
keyhashes.add(query.substring(i * plasmaWordIndexEntry.wordHashLength, (i + 1) * plasmaWordIndexEntry.wordHashLength));
@ -92,9 +104,73 @@ public final class search {
final long timestamp = System.currentTimeMillis();
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
count, duetime, ".*");
count, duetime, ".*");
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
serverObjects prop = new serverObjects();
yacyCore.log.logInfo("INIT HASH SEARCH: " + squery.queryHashes + " - " + squery.wantedResults + " links");
long timestamp1 = System.currentTimeMillis();
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
plasmaSearchResult acc = null;
int idxc = 0;
try {
idxc = theSearch.localSearch();
acc = theSearch.order();
} catch (IOException e) {
}
// result is a List of urlEntry elements
if ((idxc == 0) || (acc == null)) {
prop.put("totalcount", "0");
prop.put("linkcount", "0");
prop.put("references", "");
} else {
prop.put("totalcount", Integer.toString(acc.sizeOrdered()));
int i = 0;
StringBuffer links = new StringBuffer();
String resource = "";
//plasmaIndexEntry pie;
plasmaCrawlLURL.Entry urlentry;
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = acc.nextElement();
snippet = sb.snippetCache.retrieve(urlentry.url(), squery.queryHashes, false, 260);
if (snippet.source == plasmaSnippetCache.ERROR_NO_MATCH) {
// suppress line: there is no match in that resource
} else {
if (snippet.line == null) {
resource = urlentry.toString();
} else {
resource = urlentry.toString(snippet.line);
}
if (resource != null) {
links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString);
i++;
}
}
}
prop.put("links", links.toString());
prop.put("linkcount", Integer.toString(i));
// prepare reference hints
Object[] ws = acc.getReferences(16);
StringBuffer refstr = new StringBuffer();
for (int j = 0; j < ws.length; j++)
refstr.append(",").append((String) ws[j]);
prop.put("references", (refstr.length() > 0) ? refstr.substring(1) : refstr.toString());
// add information about forward peers
prop.put("fwhop", ""); // hops (depth) of forwards that had been performed to construct this result
prop.put("fwsrc", ""); // peers that helped to construct this result
prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations)
}
prop = sb.searchFromRemote(squery);
// log
yacyCore.log.logInfo("EXIT HASH SEARCH: " + squery.queryHashes + " - " + idxc + " links found, " + prop.get("linkcount", "?") + " links selected, " + ((System.currentTimeMillis() - timestamp1) / 1000) + " seconds");
prop.put("searchtime", Long.toString(System.currentTimeMillis() - timestamp));
final int links = Integer.parseInt(prop.get("linkcount","0"));

@ -151,8 +151,8 @@ public final class plasmaCrawlLURL extends plasmaURL {
return e;
}
public synchronized Entry addEntry(Entry e, String initiatorHash, String executorHash, int stackType) {
if (e == null) { return null; }
public synchronized void addEntry(Entry e, String initiatorHash, String executorHash, int stackType) {
if (e == null) { return; }
try {
if (initiatorHash == null) { initiatorHash = dummyHash; }
if (executorHash == null) { executorHash = dummyHash; }
@ -165,10 +165,10 @@ public final class plasmaCrawlLURL extends plasmaURL {
case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
}
return e;
return;
} catch (Exception ex) {
System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString());
return null;
return;
}
}
@ -176,27 +176,14 @@ public final class plasmaCrawlLURL extends plasmaURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
public synchronized Entry getEntry(String hash) throws IOException {
return new Entry(hash);
/**
 * Loads the URL entry that belongs to the given url hash.
 *
 * @param hash the url hash of the wanted entry
 * @param searchedWord the word index entry to attach to the returned entry
 *        (available later through Entry.word()); may be null when the
 *        lookup is not made for a specific word index entry
 * @return a new Entry built from the hash and the given word index entry
 * @throws IOException passed on from the Entry constructor
 */
public synchronized Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
return new Entry(hash, searchedWord);
}
public synchronized Entry newEntry(Entry oldEntry) {
if (oldEntry == null) return null;
/*
* de.anomic.plasma.plasmaCrawlLURL.Entry.Entry(URL url, String descr,
* Date moddate, Date loaddate,
* String referrerHash,
* int copyCount,
* boolean localNeed,
* int quality,
* String language,
* char doctype,
* long size,
* int wordCount)
*/
return new Entry(
oldEntry.url(),
oldEntry.hash(),
oldEntry.descr(),
oldEntry.moddate(),
oldEntry.loaddate(),
@ -209,7 +196,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
oldEntry.size(),
oldEntry.wordCount());
}
public synchronized Entry newEntry(String propStr, boolean setGlobal) {
if (propStr.startsWith("{") && propStr.endsWith("}")) {
return new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal);
@ -356,7 +343,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
urlHash = getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = getEntry(urlHash);
urle = getEntry(urlHash, null);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash);
@ -397,72 +384,44 @@ public final class plasmaCrawlLURL extends plasmaURL {
public class Entry {
private URL url;
private String descr;
private Date moddate;
private Date loaddate;
private String urlHash;
private String referrerHash;
private int copyCount;
private String flags;
private int quality;
private String language;
private char doctype;
private long size;
private int wordCount;
private String snippet;
public Entry(
URL url,
String descr,
Date moddate,
Date loaddate,
String referrerHash,
int copyCount,
boolean localNeed,
int quality,
String language,
char doctype,
long size,
int wordCount
) {
this(url,null,descr,moddate,loaddate,referrerHash,copyCount,localNeed,quality,language,doctype,size,wordCount);
}
Entry(
URL url,
String theUrlHash,
String descr,
Date moddate,
Date loaddate,
String referrerHash,
int copyCount,
boolean localNeed,
int quality,
String language,
char doctype,
long size,
int wordCount
) {
// create new entry and store it into database
this.urlHash = (theUrlHash == null) ? urlHash(url) : theUrlHash;
this.url = url;
this.descr = (descr==null)?this.url.toString():descr;
this.moddate = moddate;
this.loaddate = loaddate;
this.referrerHash = (referrerHash == null) ? dummyHash : referrerHash;
this.copyCount = copyCount; // the number of remote (global) copies of this object without this one
this.flags = (localNeed) ? "L " : " ";
this.quality = quality;
this.language = (language==null)?"uk":language;
this.doctype = doctype;
this.size = size;
this.wordCount = wordCount;
this.snippet = null;
store();
}
public Entry(String urlHash) throws IOException {
private URL url;
private String descr;
private Date moddate;
private Date loaddate;
private String urlHash;
private String referrerHash;
private int copyCount;
private String flags;
private int quality;
private String language;
private char doctype;
private long size;
private int wordCount;
private String snippet;
private plasmaWordIndexEntry word;
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, long size, int wordCount) {
    // Builds a fresh entry from the given attributes and writes it to the
    // database right away (see the store() call at the end).
    // Absent values get defaults: the description falls back to the url
    // string, the referrer to dummyHash and the language to "uk".
    this.urlHash = urlHash(url);
    this.url = url;
    if (descr == null) {
        this.descr = this.url.toString();
    } else {
        this.descr = descr;
    }
    this.moddate = moddate;
    this.loaddate = loaddate;
    if (referrerHash == null) {
        this.referrerHash = dummyHash;
    } else {
        this.referrerHash = referrerHash;
    }
    // copyCount: the number of remote (global) copies of this object without this one
    this.copyCount = copyCount;
    if (localNeed) {
        this.flags = "L ";
    } else {
        this.flags = " ";
    }
    this.quality = quality;
    if (language == null) {
        this.language = "uk";
    } else {
        this.language = language;
    }
    this.doctype = doctype;
    this.size = size;
    this.wordCount = wordCount;
    // a newly created entry carries neither a snippet nor a word index entry
    this.snippet = null;
    this.word = null;
    store();
}
public Entry(String urlHash, plasmaWordIndexEntry searchedWord) throws IOException {
// generates a plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@ -488,6 +447,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.size = kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[11], "UTF-8"));
this.wordCount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[12], "UTF-8"));
this.snippet = null;
this.word = searchedWord;
return;
}
} catch (Exception e) {
@ -519,8 +479,9 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.doctype = prop.getProperty("dt", "t").charAt(0);
this.size = Long.parseLong(prop.getProperty("size", "0"));
this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
store();
//}
} catch (Exception e) {
@ -623,6 +584,10 @@ public final class plasmaCrawlLURL extends plasmaURL {
return snippet;
}
/**
 * Returns the word index entry that is attached to this url entry.
 *
 * @return the attached word index entry, or null if none was attached
 *         when this entry was created
 */
public plasmaWordIndexEntry word() {
return word;
}
private StringBuffer corePropList() {
// generate a parseable string; this is a simple property-list
final StringBuffer corePropStr = new StringBuffer(300);
@ -640,23 +605,14 @@ public final class plasmaCrawlLURL extends plasmaURL {
.append(",dt=") .append(doctype)
.append(",lang=") .append(language)
.append(",url=") .append(crypt.simpleEncode(url.toString()))
.append(",descr=") .append(crypt.simpleEncode(descr));
.append(",descr=") .append(crypt.simpleEncode(descr));
if (this.word != null) {
// append also word properties
corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toExternalForm()));
}
return corePropStr;
// return
// "hash=" + urlHash +
// ",referrer=" + referrerHash +
// ",mod=" + shortDayFormatter.format(moddate) +
// ",load=" + shortDayFormatter.format(loaddate) +
// ",size=" + size +
// ",wc=" + wordCount +
// ",cc=" + copyCount +
// ",local=" + ((local()) ? "true" : "false") +
// ",q=" + serverCodings.enhancedCoder.encodeBase64Long(quality, urlQualityLength) +
// ",dt=" + doctype +
// ",lang=" + language +
// ",url=" + crypt.simpleEncode(url.toString()) +
// ",descr=" + crypt.simpleEncode(descr);
} catch (Exception e) {
// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
@ -666,6 +622,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
}
}
/*
public String toString(int posintext, int posinphrase, int posofphrase) {
// add information needed for remote transport
final StringBuffer core = corePropList();
@ -678,15 +635,9 @@ public final class plasmaCrawlLURL extends plasmaURL {
.append(",posofphraseint=").append(posofphrase)
.append("}");
return core.toString();
// return
// "{" + core +
// ",posintext=" + posintext +
// ",posinphrase=" + posinphrase +
// ",posofphraseint=" + posofphrase +
// "}";
}
*/
public String toString(String snippet) {
// add information needed for remote transport
final StringBuffer core = corePropList();
@ -694,7 +645,8 @@ public final class plasmaCrawlLURL extends plasmaURL {
core.ensureCapacity(core.length() + snippet.length()*2);
core.insert(0,"{");
core.append(",snippet=").append(crypt.simpleEncode(snippet)).append("}");
core.append(",snippet=").append(crypt.simpleEncode(snippet));
core.append("}");
return core.toString();
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
@ -751,7 +703,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
String hash = null;
try {
hash = new String(e);
return new Entry(hash);
return new Entry(hash, null);
} catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + hash);
}

@ -253,7 +253,7 @@ public class plasmaDbImporter extends Thread {
String urlHash = importWordIdxEntry.getUrlHash();
if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) try {
// importing the new url
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash);
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, importWordIdxEntry);
urlCounter++;
this.homeUrlDB.newEntry(urlEntry);

@ -236,7 +236,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
entry = preorder.next();
// find the url entry
try {
page = urlStore.getEntry(entry.getUrlHash());
page = urlStore.getEntry(entry.getUrlHash(), entry);
// add a result
acc.addResult(entry, page);
} catch (IOException e) {

@ -134,7 +134,6 @@ import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroTables;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverInstantThread;
import de.anomic.server.serverObjects;
@ -1322,9 +1321,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaWordIndexEntity wordIdxEntity = new plasmaWordIndexEntity(wordHash);
plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
condenser.wordCount(word),
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
condenser.wordPositionInText(word),
condenser.wordPositionInPhrase(word),
condenser.wordNumberOfPhrase(word),
0,
docDate.getTime(),
quality, language, doctype, true);
wordIdxEntity.addEntry(wordIdxEntry);
@ -1575,7 +1577,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.addEntry(urlPool.loadedURL.newEntry(propStr, true), yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true);
urlPool.loadedURL.addEntry(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** superfluous/duplicate?
urlPool.noticeURL.remove(entry.hash());
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
@ -1760,85 +1763,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
/**
 * Runs a search restricted to the local index for the given query and packs
 * the result into a property set.
 *
 * The returned properties are: "totalcount", "linkcount", "links" (one
 * "resource<i>=..." line per selected url entry; only set when a result
 * exists), "references" and the forward-peer fields "fwhop", "fwsrc" and
 * "fwrec", which are always set to empty strings here.
 *
 * Side effects: the word index and all threads are told to pause for twice
 * the query's maximum time, and query.domType is overwritten.
 *
 * @param query the search query; its domType is set to SEARCHDOM_LOCAL
 * @return the property set described above, or null if the search threw an IOException
 */
public serverObjects searchFromRemote(plasmaSearchQuery query) {
// tell all threads to do nothing for a specific time
wordIndex.intermission(2 * query.maximumTime);
intermissionAllThreads(2 * query.maximumTime);
// force a local-only search
query.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
serverObjects prop = new serverObjects();
try {
log.logInfo("INIT HASH SEARCH: " + query.queryHashes + " - " + query.wantedResults + " links");
long timestamp = System.currentTimeMillis();
plasmaSearchEvent theSearch = new plasmaSearchEvent(query, log, wordIndex, urlPool.loadedURL, snippetCache);
int idxc = theSearch.localSearch();
plasmaSearchResult acc = theSearch.order();
// result is a List of urlEntry elements
if (acc == null) {
// no result: report zero counts and no references
prop.put("totalcount", "0");
prop.put("linkcount", "0");
prop.put("references", "");
} else {
prop.put("totalcount", Integer.toString(acc.sizeOrdered()));
int i = 0; // number of links selected so far
StringBuffer links = new StringBuffer();
String resource = "";
//plasmaIndexEntry pie;
plasmaCrawlLURL.Entry urlentry;
plasmaSnippetCache.result snippet;
// collect at most query.wantedResults entries
while ((acc.hasMoreElements()) && (i < query.wantedResults)) {
urlentry = acc.nextElement();
snippet = snippetCache.retrieve(urlentry.url(), query.queryHashes, false, 260);
if (snippet.source == plasmaSnippetCache.ERROR_NO_MATCH) {
// suppress line: there is no match in that resource
} else {
// serialize the entry, including the snippet line if one is available
if (snippet.line == null) {
resource = urlentry.toString();
} else {
resource = urlentry.toString(snippet.line);
}
// only entries that could be serialized are counted
if (resource != null) {
links.append("resource").append(i).append("=").append(resource).append(serverCore.crlfString);
i++;
}
}
}
prop.put("links", links.toString());
prop.put("linkcount", Integer.toString(i));
// prepare reference hints
Object[] ws = acc.getReferences(16);
StringBuffer refstr = new StringBuffer();
for (int j = 0; j < ws.length; j++) refstr.append(",").append((String) ws[j]);
// substring(1) drops the leading comma of the comma-separated list
prop.put("references", (refstr.length() > 0)?refstr.substring(1):refstr.toString());
}
// add information about forward peers
prop.put("fwhop", ""); // hops (depth) of forwards that had been performed to construct this result
prop.put("fwsrc", ""); // peers that helped to construct this result
prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations)
// log
log.logInfo("EXIT HASH SEARCH: " + query.queryHashes + " - " + idxc + " links found, " +
prop.get("linkcount", "?") + " links selected, " +
((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
return prop;
} catch (IOException e) {
// NOTE(review): the exception is dropped without logging; callers only see null
return null;
}
}
public serverObjects action(String actionName, serverObjects actionInput) {
// perform an action. (not used)
// perform an action. (not used)
return null;
}
public String toString() {
// it is possible to use this method in the cgi pages.
// actually it is used there for testing purpose
@ -1856,7 +1785,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// determine the url string
try {
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash, null);
URL url = entry.url();
if (url == null)
return 0;

@ -290,7 +290,7 @@ public class plasmaSwitchboardQueue {
if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(plasmaURL.dummyHash))) return null;
try {
referrerURL = lurls.getEntry(referrerHash).url();
referrerURL = lurls.getEntry(referrerHash, null).url();
} catch (IOException e) {
referrerURL = null;
return null;

@ -76,7 +76,7 @@ public class plasmaURLPool {
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
try {
plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash);
plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash, null);
if (le != null) return le.url();
} catch (IOException e) {}
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);

@ -160,9 +160,12 @@ public final class plasmaWordIndex {
wordHash = plasmaWordIndexEntry.word2hash(word);
entry = new plasmaWordIndexEntry(urlHash,
condenser.wordCount(word),
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
condenser.wordPositionInText(word),
condenser.wordPositionInPhrase(word),
condenser.wordNumberOfPhrase(word),
0,
urlModified.getTime(), quality, language, doctype, true);
addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), false);
}

@ -70,7 +70,7 @@ public final class plasmaWordIndexAssortment {
4, // occurrence counter
8, // timestamp of last access
plasmaWordIndexEntry.urlHashLength, // corresponding URL hash
plasmaWordIndexEntry.attrSpaceLong // URL attributes
plasmaWordIndexEntry.attrSpace // URL attributes
};
// class variables

@ -359,7 +359,7 @@ public final class plasmaWordIndexDistribution {
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
try {
lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry);
if ((lurl == null) || (lurl.url() == null)) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
@ -399,7 +399,7 @@ public final class plasmaWordIndexDistribution {
while ((urlIter.hasNext()) && (count > 0)) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
try {
lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry);
if ((lurl == null) || (lurl.url()==null)) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {

@ -95,10 +95,10 @@ public final class plasmaWordIndexEntity {
kt = new kelondroTree(theLocation, cacheSize);
} catch (IOException e) {
theLocation.delete();
kt = new kelondroTree(theLocation, cacheSize, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpaceLong, false);
kt = new kelondroTree(theLocation, cacheSize, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpace, false);
} else {
// create new index file
kt = new kelondroTree(theLocation, cacheSize, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpaceLong, false);
kt = new kelondroTree(theLocation, cacheSize, plasmaURL.urlHashLength, plasmaWordIndexEntry.attrSpace, false);
}
return kt; // everyone who get this should close it when finished!
}
@ -157,6 +157,16 @@ public final class plasmaWordIndexEntity {
} catch (IOException e) {}
}
/**
 * Looks up the index entry stored for the given url hash.
 * If a temporary map is present it is consulted, otherwise the row is read
 * from the index and converted into a plasmaWordIndexEntry.
 *
 * @param urlhash the url hash to look up
 * @return the entry for that url hash, or null if there is none
 * @throws IOException declared for the access to the index
 */
public plasmaWordIndexEntry getEntry(String urlhash) throws IOException {
    if (theTmpMap != null) {
        return (plasmaWordIndexEntry) theTmpMap.get(urlhash);
    }
    final byte[][] row = theIndex.get(urlhash.getBytes());
    return (row == null) ? null : new plasmaWordIndexEntry(new String(row[0]), new String(row[1]));
}
/**
 * Tells whether an entry for the given url hash exists.
 * If a temporary map is present it is consulted, otherwise the index is asked.
 *
 * @param urlhash the url hash to look for
 * @return true if an entry for that url hash exists
 * @throws IOException declared for the access to the index
 */
public boolean contains(String urlhash) throws IOException {
    if (theTmpMap != null) {
        return theTmpMap.containsKey(urlhash);
    }
    final boolean foundInIndex = theIndex.get(urlhash.getBytes()) != null;
    return foundInIndex;
}
@ -390,12 +400,17 @@ public final class plasmaWordIndexEntity {
System.out.println("DEBUG: JOIN METHOD BY TEST");
plasmaWordIndexEntity conj = new plasmaWordIndexEntity(null); // start with empty search result
Iterator se = small.elements(true);
plasmaWordIndexEntry ie;
plasmaWordIndexEntry ie0, ie1;
long stamp = System.currentTimeMillis();
try {
while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
ie = (plasmaWordIndexEntry) se.next();
if (large.contains(ie)) conj.addEntry(ie);
ie0 = (plasmaWordIndexEntry) se.next();
ie1 = large.getEntry(ie0.getUrlHash());
if (ie1 != null) {
// this is a hit. Calculate word distance:
ie0.combineDistance(ie1);
conj.addEntry(ie0);
}
}
} catch (kelondroException e) {
//serverLog.logSevere("PLASMA", "joinConstructiveByTest: Database corrupt (" + e.getMessage() + "), deleting index");
@ -449,6 +464,7 @@ public final class plasmaWordIndexEntity {
}
} else {
// we have found the same urls in different searches!
ie1.combineDistance(ie2);
conj.addEntry(ie1);
try {
if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;

@ -67,24 +67,26 @@ public final class plasmaWordIndexEntry {
public static final int urlHashLength = yacySeedDB.commonHashLength; // 12
// the size of the index entry attributes
//public static final int attrSpaceShort = 12;
public static final int attrSpaceLong = 18;
public static final int attrSpace = 24;
// the associated hash
private final String urlHash;
// discrete values
private int count; // words in file
private int hitcount; // how often the word appears in the text
private int wordcount; // total number of words in the text
private int phrasecount; // total number of phrases in the text
private int posintext; // first position of the word in text as number of word; 0=unknown or irrelevant position
private int posinphrase; // position within a phrase of the word
private int posofphrase; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text
private int worddistance; // word distance; 0 by default, set to the difference of posintext of two entries when these are combined
private long lastModified;// calculated by using last-modified
private int quality; // result of a heuristic on the source file
private byte[] language; // essentially the country code (the TLD as heuristic), two letters lowercase only
private char doctype; // type of source
private char localflag; // indicates if the index was created locally
// some doctypes:
// doctypes:
public static final char DT_PDFPS = 'p';
public static final char DT_TEXT = 't';
public static final char DT_HTML = 'h';
@ -97,6 +99,19 @@ public final class plasmaWordIndexEntry {
public static final char DT_BINARY = 'b';
public static final char DT_UNKNOWN = 'u';
// appearance locations (used for flags)
public static final int AP_TITLE = 0; // title tag from html header
public static final int AP_H1 = 1; // h1-tag
public static final int AP_H2 = 2;
public static final int AP_H3 = 3;
public static final int AP_H4 = 4;
public static final int AP_H5 = 5;
public static final int AP_H6 = 6;
public static final int AP_ANCHOR = 7; // anchor description
public static final int AP_URL = 8; // word inside a url
public static final int AP_IMG = 9; // tag inside image references
public static final int AP_TAG = 10; // for tagged indexing (e.g. using mp3 tags)
// local flag attributes
public static final char LT_LOCAL = 'L';
public static final char LT_GLOBAL = 'G';
@ -187,23 +202,22 @@ public final class plasmaWordIndexEntry {
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
public plasmaWordIndexEntry(String urlHash,
int count, // how often appears this word in the text
int posintext,
int posinphrase,
int posofphrase,
long time,
int quality,
String language,
char doctype,
int hitcount, // how often appears this word in the text
int wordcount, // total number of words
int phrasecount, // total number of phrases
int posintext, // position of word in all words
int posinphrase, // position of word in its phrase
int posofphrase, // number of the phrase where word appears
int distance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
long time, // last-modified time of the document where word appears
int quality, //
String language, //
char doctype, //
boolean local) {
// more needed attributes:
// - int: length of text / total number of words
// - int: length of text / total number of sentences
// - long: update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
// - int: word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
// - char: category of appearance (header, title, section, text, anchor-descr, image-tag etc)
// - boolean: appears in title, appears in header, appears in ....
// - boolean: appears in title, appears in header, anchor-descr, image-tag etc
// - int: url-length (shorter are better)
// - int: url-number of components / length of path
// - int: length of description tag / title tag (longer are better)
@ -211,10 +225,13 @@ public final class plasmaWordIndexEntry {
if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk";
this.urlHash = urlHash;
this.count = count;
this.hitcount = hitcount;
this.wordcount = wordcount;
this.phrasecount = phrasecount;
this.posintext = posintext;
this.posinphrase = posinphrase;
this.posofphrase = posofphrase;
this.worddistance = distance;
this.lastModified = time;
this.quality = quality;
this.language = language.getBytes();
@ -225,15 +242,18 @@ public final class plasmaWordIndexEntry {
// Builds an entry from a URL hash plus the compact attribute string; the
// URL hash itself is not contained in the code string.
// Offsets decoded below: 0-3 quality, 3-6 last-modified (micro-date days),
// 6-8 hit count, 8-10 language, 10 doctype, 11 local flag, 12-14 posintext,
// 14-16 posinphrase, 16-18 posofphrase, 18-20 worddistance, 20-22 wordcount,
// 22-24 phrasecount. Everything from offset 12 on is meant to default to 0
// when the code string is too short to carry it.
public plasmaWordIndexEntry(String urlHash, String code) {
// the fixed-width fields of the code string are decoded right here
this.urlHash = urlHash;
this.count = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(6, 8));
this.posintext = (code.length() >= 14) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(12, 14)) : 0;
this.posinphrase = (code.length() >= 15) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(14, 16)) : 0;
this.posofphrase = (code.length() >= 16) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(16, 18)) : 0;
this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(6, 8));
this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(3, 6)));
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(0, 3));
this.language = code.substring(8, 10).getBytes();
this.doctype = code.charAt(10);
this.localflag = code.charAt(11);
// NOTE(review): posintext/posinphrase/posofphrase are assigned a second time
// below; these later assignments are the ones that take effect.
// NOTE(review): String.substring(begin, end) requires length() >= end, but the
// guards for substring(14, 16), (16, 18), (18, 20), (20, 22) and (22, 24) test
// for end - 1 (and the earlier `>= 16` guard for substring(16, 18) is two short).
// A code string of length 15, 17, 19, 21 or 23 would therefore throw
// StringIndexOutOfBoundsException instead of falling back to 0 -- confirm and
// tighten the guards to 16 / 18 / 20 / 22 / 24.
this.posintext = (code.length() >= 14) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(12, 14)) : 0;
this.posinphrase = (code.length() >= 15) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(14, 16)) : 0;
this.posofphrase = (code.length() >= 17) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(16, 18)) : 0;
this.worddistance = (code.length() >= 19) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(18, 20)) : 0;
this.wordcount = (code.length() >= 21) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(20, 22)) : 0;
this.phrasecount = (code.length() >= 23) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(22, 24)) : 0;
}
public plasmaWordIndexEntry(String external) {
@ -246,10 +266,13 @@ public final class plasmaWordIndexEntry {
}
// set values
this.urlHash = pr.getProperty("h", "");
this.count = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("c", "A"));
this.hitcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("c", "A"));
this.wordcount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("w", "__"));
this.phrasecount = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("p", "__"));
this.posintext = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("t", "__"));
this.posinphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("r", "__"));
this.posofphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("o", "__"));
this.worddistance = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("i", "__"));
this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("a", "A")));
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__"));
this.language = pr.getProperty("l", "uk").getBytes();
@ -260,85 +283,64 @@ public final class plasmaWordIndexEntry {
// Encodes the attributes of this entry (everything except the URL hash) into
// one string by appending the encoded fields in a fixed order: quality,
// last-modified (micro-date days), hit count, language, doctype, local flag,
// posintext, posinphrase, posofphrase, worddistance, wordcount, phrasecount.
// This is the same order in which the (urlHash, code) constructor reads the
// fields back.
public String toEncodedForm() {
// note: the URL hash is NOT part of this encoding;
// if you need a complete dump, use toExternalForm()
StringBuffer buf = new StringBuffer(attrSpaceLong);
StringBuffer buf = new StringBuffer(attrSpace);
buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, plasmaURL.urlQualityLength))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.count, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2))
.append(new String(this.language))
.append(this.doctype)
.append(this.localflag); // 3 + 3 + 2 + 2 + 1 + 1 = 12 bytes
buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2));
.append(this.localflag)
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2)); // 3+3+2+2+1+1+2+2+2+2+2+2= 24 bytes
return buf.toString();
}
}
// Serializes the complete entry, URL hash included, as a brace-enclosed,
// comma-separated list of key=value pairs. Keys: h url hash, q quality,
// a last-modified (micro-date days), c hit count, l language, d doctype,
// f local flag, t posintext, r posinphrase, o posofphrase, i worddistance,
// w wordcount, p phrasecount.
public String toExternalForm() {
public String toExternalForm() {
// 61 is only the initial capacity; StringBuffer grows as needed
StringBuffer str = new StringBuffer(61);
str.append("{")
.append("h=").append(this.urlHash)
.append( "h=").append(this.urlHash)
.append(",q=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, plasmaURL.urlQualityLength))
.append(",a=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3))
.append(",c=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.count, 2))
.append(",c=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.hitcount, 2))
.append(",l=").append(new String(this.language))
.append(",d=").append(this.doctype)
.append(",f=").append(this.localflag)
.append(",t=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2))
.append(",r=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
.append(",o=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2))
.append(",i=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.worddistance, 2))
.append(",w=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.wordcount, 2))
.append(",p=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.phrasecount, 2))
.append("}");
return str.toString();
}
// plain accessors for the entry attributes
public String getUrlHash() {
return urlHash;
}
public int getQuality() {
return quality;
}
// last-modified time converted with plasmaWordIndex.microDateDays
public int getVirtualAge() {
return plasmaWordIndex.microDateDays(lastModified);
}
public long getLastModified() {
return lastModified;
// Folds another entry's distance information into this entry: the new
// worddistance is this entry's distance, plus the other entry's distance,
// plus the absolute gap between the two posintext values. Only this entry
// is modified; oe is read but not changed.
public void combineDistance(plasmaWordIndexEntry oe) {
this.worddistance = this.worddistance + oe.worddistance + Math.abs(this.posintext - oe.posintext);
}
public int getCount() {
return count;
}
public int posintext() {
return posintext;
}
public int posinphrase() {
return posinphrase;
}
public int posofphrase() {
return posofphrase;
}
// language bytes converted back to a String
public String getLanguage() {
return new String(language);
}
public char getType() {
return doctype;
}
// true when the stored local flag equals LT_LOCAL
public boolean isLocal() {
return localflag == LT_LOCAL;
}
/** Returns the URL hash this entry refers to. */
public String getUrlHash() {
    return this.urlHash;
}

/** Returns the quality value stored with this entry. */
public int getQuality() {
    return this.quality;
}

/** Returns the last-modified time converted with {@code plasmaWordIndex.microDateDays}. */
public int getVirtualAge() {
    return plasmaWordIndex.microDateDays(this.lastModified);
}

/** Returns the last-modified time of the document where the word appears. */
public long getLastModified() {
    return this.lastModified;
}

/** Returns the hit count, i.e. how often the word appears in the text. */
public int getCount() {
    return this.hitcount;
}

/** Returns the position of the word among all words of the text. */
public int posintext() {
    return this.posintext;
}

/** Returns the position of the word inside its phrase. */
public int posinphrase() {
    return this.posinphrase;
}

/** Returns the number of the phrase in which the word appears. */
public int posofphrase() {
    return this.posofphrase;
}

/** Returns the word distance; see {@code combineDistance} for how it accumulates. */
public int worddistance() {
    return this.worddistance;
}

/** Returns the total number of words. */
public int wordcount() {
    return this.wordcount;
}

/** Returns the total number of phrases. */
public int phrasecount() {
    return this.phrasecount;
}

/** Returns the language bytes as a new String (decoded with the platform default charset). */
public String getLanguage() {
    return new String(this.language);
}

/** Returns the doctype character of this entry. */
public char getType() {
    return this.doctype;
}

/** Returns {@code true} when the local flag of this entry equals {@code LT_LOCAL}. */
public boolean isLocal() {
    return LT_LOCAL == this.localflag;
}
public static void main(String[] args) {
// outputs the word hash to a given word

@ -61,13 +61,10 @@ import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.Date;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.Vector;
import de.anomic.http.httpHeader;

@ -59,7 +59,6 @@ import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.plasma.plasmaURLPattern;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaSearchProfile;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
@ -441,8 +440,7 @@ public final class yacyClient {
//System.out.println("yacyClient: search result = " + result.toString()); // debug
final int results = Integer.parseInt((String) result.get("count"));
//System.out.println("***result count " + results);
plasmaCrawlLURL.Entry link;
// create containers
final int words = wordhashes.length() / plasmaWordIndexEntry.wordHashLength;
plasmaWordIndexEntryContainer[] container = new plasmaWordIndexEntryContainer[words];
@ -451,21 +449,31 @@ public final class yacyClient {
}
// insert results to containers
plasmaCrawlLURL.Entry lEntry;
plasmaCrawlLURL.Entry urlEntry;
for (int n = 0; n < results; n++) {
// get one single search result
lEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if (lEntry != null && blacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath())) { continue; } // block with backlist
link = urlManager.addEntry(lEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist
urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
final plasmaWordIndexEntry entry = new plasmaWordIndexEntry(link.hash(), link.wordCount(), 0, 0, 0,
plasmaWordIndex.microDateDays(link.moddate()), link.quality(),
link.language(), link.doctype(), false);
if (link.snippet() != null) {
final plasmaWordIndexEntry entry;
if (urlEntry.word() == null)
entry = new plasmaWordIndexEntry(
urlEntry.hash(),
urlEntry.wordCount(),
0, 0, 0, 0, 0, 0,
urlEntry.moddate().getTime(),
urlEntry.quality(),
urlEntry.language(),
urlEntry.doctype(),
false
);
else entry = urlEntry.word();
if (urlEntry.snippet() != null) {
// we don't store the snippets along the url entry, because they are search-specific.
// instead, they are placed in a snippet-search cache.
//System.out.println("--- RECEIVED SNIPPET '" + link.snippet() + "'");
snippets.storeToCache(wordhashes, link.hash(), link.snippet());
snippets.storeToCache(wordhashes, urlEntry.hash(), urlEntry.snippet());
}
// add the url entry to the word indexes
for (int m = 0; m < words; m++) {

@ -754,7 +754,7 @@ public final class yacy {
String urlHash = importWordIdxEntry.getUrlHash();
if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try {
// importing the new url
plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash);
plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash, null);
urlCounter++;
homeUrlDB.newEntry(urlEntry);
@ -861,7 +861,7 @@ public final class yacy {
wordIdxEntry = (plasmaWordIndexEntry) wordIdxEntries.next();
String urlHash = wordIdxEntry.getUrlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash);
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null);
urlCounter++;
/*plasmaCrawlLURL.Entry newEntry =*/ minimizedUrlDB.newEntry(urlEntry);
if (urlCounter % 500 == 0) {

Loading…
Cancel
Save