- enhanced monitoring of ranking parameters

for details, please visit http://localhost:8080/IndexControlRWIs_p.html
- fixed computation of ranking ordering in some cases

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4220 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent bd5673efbe
commit c527969185

@ -169,7 +169,7 @@ public class Bookmarks {
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
indexURLEntry urlentry = switchboard.wordIndex.loadedURL.load(urlHash, null);
indexURLEntry urlentry = switchboard.wordIndex.loadedURL.load(urlHash, null, 0);
plasmaParserDocument document = null;
if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp();

@ -167,7 +167,7 @@ public class CrawlResults {
urlHash = sb.wordIndex.loadedURL.getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = sb.wordIndex.loadedURL.load(urlHash, null);
urle = sb.wordIndex.loadedURL.load(urlHash, null, 0);
indexURLEntry.Components comp = urle.comp();
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);

@ -92,9 +92,9 @@
<input type="radio" name="lines" value="1000" />1000&nbsp;&nbsp;
</dd>
<dt class="TableCellDark">Ordering of list:</dt>
<dd><input type="radio" name="ordering" value="0" checked="checked" />by URL&nbsp;&nbsp;
<dd><input type="radio" name="ordering" value="2" checked="checked" />by Ranking&nbsp;&nbsp;
<input type="radio" name="ordering" value="0"/>by URL&nbsp;&nbsp;
<input type="radio" name="ordering" value="1" />by URL Hash&nbsp;&nbsp;
<!-- <input type="radio" name="ordering" value="2" />by Ranking&nbsp;&nbsp;-->
</dd>
<dt class="TableCellLight"></dt>
<dd><input type="submit" name="urllist" value="List Selected URLs" />
@ -134,30 +134,61 @@
<form action="IndexControlRWIs_p.html" method="post" enctype="multipart/form-data">
<p>
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader"><td>&nbsp;</td><td>hash</td><td>url</td><td>pos</td><td>phrase</td><td>urlcomps</td><td>urllength</td><td width="60%">props</td></tr>
<tr class="TableHeader">
<td colspan="3">Resource</td>
<td colspan="8">Negative Ranking Factors</td>
<td colspan="7">Positive Ranking Factors</td>
<td rowspan="2">Reverse Normalized Weighted Ranking Sum</td>
</tr>
<tr class="TableHeader">
<td>&nbsp;</td>
<td>hash</td>
<td>url</td>
<td>dom length</td>
<td>ybr</td>
<td>url comps</td>
<td>url length</td>
<td>pos in text</td>
<td>pos of phrase</td>
<td>pos in phrase</td>
<td>word distance</td>
<td>date</td>
<td>words in title</td>
<td>words in text</td>
<td>local links</td>
<td>remote links</td>
<td>hitcount</td>
<td>props</td>
<td></td>
</tr>
#{urlList}#
<tr class="TableCellLight">
#(urlExists)#
<td><input type="checkbox" id="urlhx.#[urlhxCount]#" name="urlhx.#[urlhxCount]#" checked value="#[urlhxValue]#" align="top" />
<td class="TableCellDark"><input type="checkbox" id="urlhx.#[urlhxCount]#" name="urlhx.#[urlhxCount]#" checked value="#[urlhxValue]#" align="top" />
<label for="urlhx.#[urlhxCount]#" class="tt">#[urlhxValue]#</label></td>
<td>&lt;unresolved URL Hash&gt;</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</span>
<td colspan="15"></td>
::
<td><input type="checkbox" id="urlhx.#[urlhxCount]#" name="urlhx.#[urlhxCount]#" #(urlhxChecked)#::checked="checked" #(/urlhxChecked)#value="#[urlhxValue]#" align="top" />
<label for="urlhx.#[urlhxCount]#" class="tt"></label></td>
<td><a href="/IndexControlURLs_p.html?keystring=#[keyString]#&amp;keyhash=#[keyHash]#&amp;urlhash=#[urlhxValue]#&amp;urlstringsearch=&amp;urlstring=#[urlString]#" class="tt">#[urlhxValue]#</a></td>
<td><a href="#[urlString]#">#[urlStringShort]#</a></td>
<td>#[pos]#</td>
<td>#[phrase]#</td>
<td>#[urlcomps]#</td>
<td>#[urllength]#</td>
<td class="TableCellDark">#[domlength]#</td>
<td class="TableCellDark">#[ybr]#</td>
<td class="TableCellDark">#[urlcomps]#</td>
<td class="TableCellDark">#[urllength]#</td>
<td class="TableCellDark">#[pos]#</td>
<td class="TableCellDark">#[phrase]#</td>
<td class="TableCellDark">#[posinphrase]#</td>
<td class="TableCellDark">#[worddistance]#</td>
<td>#[date]#</td>
<td>#[wordsintitle]#</td>
<td>#[wordsintext]#</td>
<td>#[llocal]#</td>
<td>#[lother]#</td>
<td>#[hitcount]#</td>
<td>#[props]#</td>
<td align="right" class="TableCellDark">#[ranking]#</td>
#(/urlExists)#
</tr>
#{/urlList}#

@ -30,6 +30,7 @@ import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@ -43,10 +44,13 @@ import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.urlPattern.abstractURLPattern;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient;
@ -198,7 +202,7 @@ public class IndexControlRWIs_p {
indexURLEntry lurl;
while (urlIter.hasNext()) {
iEntry = (indexRWIEntry) urlIter.next();
lurl = sb.wordIndex.loadedURL.load(iEntry.urlHash(), null);
lurl = sb.wordIndex.loadedURL.load(iEntry.urlHash(), null, 0);
if (lurl == null) {
unknownURLEntries.add(iEntry.urlHash());
urlIter.remove();
@ -255,7 +259,7 @@ public class IndexControlRWIs_p {
yacyURL url;
for (int i=0; i<urlx.length; i++) {
urlHashes.add(urlx[i]);
indexURLEntry e = sb.wordIndex.loadedURL.load(urlx[i], null);
indexURLEntry e = sb.wordIndex.loadedURL.load(urlx[i], null, 0);
sb.wordIndex.loadedURL.remove(urlx[i]);
if (e != null) {
url = e.comp().url();
@ -283,7 +287,7 @@ public class IndexControlRWIs_p {
yacyURL url;
for (int i=0; i<urlx.length; i++) {
urlHashes.add(urlx[i]);
indexURLEntry e = sb.wordIndex.loadedURL.load(urlx[i], null);
indexURLEntry e = sb.wordIndex.loadedURL.load(urlx[i], null, 0);
sb.wordIndex.loadedURL.remove(urlx[i]);
if (e != null) {
url = e.comp().url();
@ -357,7 +361,7 @@ public class IndexControlRWIs_p {
}
private static plasmaWordIndex.Finding genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, boolean urlfetch, int sortorder) {
final plasmaWordIndex.Finding finding = sb.wordIndex.retrieveURLs(keyhash, filter, false, -1, urlfetch, sortorder);
final plasmaWordIndex.Finding finding = sb.wordIndex.retrieveURLs(new plasmaSearchQuery(keyhash, -1, filter), urlfetch, sortorder, sb.getRanking());
if (finding.size() == 0) {
prop.put("searchresult", 2);
prop.put("searchresult_wordhash", keyhash);
@ -395,37 +399,53 @@ public class IndexControlRWIs_p {
prop.put("genUrlList_ordering", ordering);
int i = 0;
yacyURL url;
Iterator iter = finding.hit();
plasmaWordIndex.Item entry;
Iterator iter = finding.urls();
indexURLEntry entry;
String us;
long rn = -1;
while (iter.hasNext()) {
entry = (plasmaWordIndex.Item) iter.next();
us = entry.url().comp().url().toNormalform(false, false);
entry = (indexURLEntry) iter.next();
us = entry.comp().url().toNormalform(false, false);
if (rn == -1) rn = entry.ranking();
prop.put("genUrlList_urlList_"+i+"_urlExists", "1");
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.index().urlHash());
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().urlHash());
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring);
prop.put("genUrlList_urlList_"+i+"_urlExists_keyHash", keyhash);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlString", us);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 60) ? (us.substring(0, 60) + "...") : us);
prop.put("genUrlList_urlList_"+i+"_urlExists_pos", entry.index().posintext());
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.index().posofphrase());
prop.put("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.index().urlcomps());
prop.put("genUrlList_urlList_"+i+"_urlExists_urllength", entry.index().urllength());
prop.put("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "<br>" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "<br>" + us.substring(20)) : us));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn));
prop.put("genUrlList_urlList_"+i+"_urlExists_domlength", yacyURL.domLengthEstimation(entry.hash()));
prop.put("genUrlList_urlList_"+i+"_urlExists_ybr", plasmaSearchRankingProcess.ybr(entry.hash()));
prop.put("genUrlList_urlList_"+i+"_urlExists_date", serverDate.shortDayTime(new Date(entry.word().lastModified())));
prop.put("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle());
prop.put("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext());
prop.put("genUrlList_urlList_"+i+"_urlExists_phrasesintext", entry.word().phrasesintext());
prop.put("genUrlList_urlList_"+i+"_urlExists_llocal", entry.word().llocal());
prop.put("genUrlList_urlList_"+i+"_urlExists_lother", entry.word().lother());
prop.put("genUrlList_urlList_"+i+"_urlExists_hitcount", entry.word().hitcount());
prop.put("genUrlList_urlList_"+i+"_urlExists_worddistance", entry.word().worddistance());
prop.put("genUrlList_urlList_"+i+"_urlExists_pos", entry.word().posintext());
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase());
prop.put("genUrlList_urlList_"+i+"_urlExists_posinphrase", entry.word().posinphrase());
prop.put("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.word().urlcomps());
prop.put("genUrlList_urlList_"+i+"_urlExists_urllength", entry.word().urllength());
prop.put("genUrlList_urlList_"+i+"_urlExists_props",
((entry.index().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") +
((entry.index().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
((entry.index().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") +
((entry.index().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_url)) ? "appears in url, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_descr)) ? "appears in description, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_author)) ? "appears in author, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_tags)) ? "appears in tags, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_reference)) ? "appears in reference, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized" : "")
((entry.word().flags().get(plasmaCondenser.flag_cat_indexof)) ? "appears on index page, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_url)) ? "appears in url, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_descr)) ? "appears in description, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_author)) ? "appears in author, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_tags)) ? "appears in tags, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_reference)) ? "appears in reference, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized, " : "") +
((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "")
);
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.index().posofphrase());
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.index().posofphrase());
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase());
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase());
try {
url = new yacyURL(us, null);
} catch (MalformedURLException e) {

@ -76,7 +76,7 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashdelete")) {
indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null);
indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null, 0);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
@ -106,7 +106,7 @@ public class IndexControlURLs_p {
yacyURL url = new yacyURL(urlstring, null);
urlhash = url.hash();
prop.put("urlhash", urlhash);
indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null);
indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null, 0);
if (entry == null) {
prop.putHTML("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
@ -120,7 +120,7 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashsearch")) {
indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null);
indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null, 0);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
@ -172,7 +172,7 @@ public class IndexControlURLs_p {
}
indexURLEntry.Components comp = entry.comp();
String referrer = null;
indexURLEntry le = (entry.referrerHash() == null) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null);
indexURLEntry le = (entry.referrerHash() == null) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null, 0);
if (le == null) {
referrer = "<unknown>";
} else {

@ -30,6 +30,7 @@ import java.util.Iterator;
import java.util.Map;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSwitchboard;
@ -63,6 +64,7 @@ public class Ranking_p {
rankingParameters.put(plasmaSearchRankingProfile.PHRASESINTEXT, "Phrases In Text");
rankingParameters.put(plasmaSearchRankingProfile.POSINTEXT, "Position In Text");
rankingParameters.put(plasmaSearchRankingProfile.POSOFPHRASE, "Position Of Phrase");
rankingParameters.put(plasmaSearchRankingProfile.POSINPHRASE, "Position In Phrase");
rankingParameters.put(plasmaSearchRankingProfile.PREFER, "Application Of Prefer Pattern");
rankingParameters.put(plasmaSearchRankingProfile.URLCOMPINTOPLIST, "URL Component Appears In Toplist");
rankingParameters.put(plasmaSearchRankingProfile.URLCOMPS, "URL Components");
@ -127,6 +129,9 @@ public class Ranking_p {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
final plasmaSwitchboard sb = (plasmaSwitchboard) env;
// clean up all search events
plasmaSearchEvent.cleanupEvents(true);
// case if no values are requested
if ((post == null) || (env == null)) {
// we create empty entries for template strings

@ -109,7 +109,7 @@ public class ViewFile {
if (urlHash.length() > 0) {
// getting the urlEntry that belongs to the url hash
indexURLEntry urlEntry = null;
urlEntry = sb.wordIndex.loadedURL.load(urlHash, null);
urlEntry = sb.wordIndex.loadedURL.load(urlHash, null, 0);
if (urlEntry == null) {
prop.put("error", "2");
prop.put("viewMode",VIEW_MODE_NO_TEXT);

@ -261,7 +261,7 @@ public final class crawlOrder {
reason = reasonString;
// send lurl-Entry as response
indexURLEntry entry;
entry = switchboard.wordIndex.loadedURL.load(url.hash(), null);
entry = switchboard.wordIndex.loadedURL.load(url.hash(), null, 0);
if (entry == null) {
response = "rejected";
lurl = "";

@ -133,7 +133,7 @@ public final class search {
long urlRetrievalAllTime = 0, snippetComputationAllTime = 0;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint);
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint, false);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
@ -162,7 +162,7 @@ public final class search {
} else {
// retrieve index containers from search request
theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint);
theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");

@ -169,7 +169,7 @@ public class yacysearch {
kelondroBitfield constraint = post.containsKey("constraint") ? new kelondroBitfield(4, post.get("constraint", "______")) : plasmaSearchQuery.catchall_constraint;
if (indexof) {
constraint = new kelondroBitfield();
constraint = new kelondroBitfield(4);
constraint.set(plasmaCondenser.flag_cat_indexof, true);
}
@ -225,7 +225,7 @@ public class yacysearch {
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
indexURLEntry urlentry = sb.wordIndex.loadedURL.load(recommendHash, null);
indexURLEntry urlentry = sb.wordIndex.loadedURL.load(recommendHash, null, 0);
if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp();
plasmaParserDocument document;
@ -266,7 +266,8 @@ public class yacysearch {
((globalsearch) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL),
"",
20,
constraint);
constraint,
false);
plasmaSearchProcessing localTiming = new plasmaSearchProcessing(4 * theQuery.maximumTime / 10, theQuery.displayResults());
String client = (String) header.get("CLIENTIP"); // the search client who initiated the search

@ -286,7 +286,7 @@ public class SitemapParser extends DefaultHandler {
String dbocc = this.switchboard.urlExists(nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
// the url was already loaded. we need to check the date
indexURLEntry oldEntry = this.switchboard.wordIndex.loadedURL.load(nexturlhash, null);
indexURLEntry oldEntry = this.switchboard.wordIndex.loadedURL.load(nexturlhash, null, 0);
if (oldEntry != null) {
Date modDate = oldEntry.moddate();
// check if modDate is null

@ -234,6 +234,7 @@ public final class httpc {
// do NOT remove this check; in case that everything works fine this call does nothing
// but if in any arror case connections stay open, this will ensure that the peer keeps running and the host server is not blocked from working
checkIdleConnections();
assert timeout != 0;
// register new connection
this.hashIndex = objCounter;
@ -401,9 +402,10 @@ public final class httpc {
this.initTime = System.currentTimeMillis();
this.lastIO = System.currentTimeMillis();
this.socket.setKeepAlive(false);
this.socket.connect(address, timeout);
// setting socket timeout and keep alive behaviour
this.socket.setSoTimeout(timeout); // waiting time for read
// get the connection
this.socket.connect(address, timeout);
if (incomingByteCountAccounting != null) {
this.clientInputByteCount = new httpdByteCountInputStream(this.socket.getInputStream(),incomingByteCountAccounting);

@ -87,21 +87,22 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder implements kelondr
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry
kelondroBitfield flags = t.flags();
long r = ((255 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
+ ((255 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4 )) << ranking.coeff_ybr)
+ ((255 - (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) )) << ranking.coeff_date)
long r =
((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
+ ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr)
+ ((t.urlcomps() == 0) ? 0 : ((256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (1 + max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps))
+ ((t.urllength() == 0) ? 0 : ((256 - (((t.urllength() - min.urllength() ) << 8) / (1 + max.urllength() - min.urllength()) )) << ranking.coeff_urllength))
+ ((t.posintext() == 0) ? 0 : ((256 - (((t.posintext() - min.posintext() ) << 8) / (1 + max.posintext() - min.posintext()) )) << ranking.coeff_posintext))
+ ((t.posofphrase() == 0) ? 0 : ((256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (1 + max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase))
+ ((t.posinphrase() == 0) ? 0 : ((256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (1 + max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase))
+ ((256 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance)
+ ( (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date)
+ ( (((t.wordsintitle() - min.wordsintitle() ) << 8) / (1 + max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle)
+ ( (((t.wordsintext() - min.wordsintext() ) << 8) / (1 + max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext)
+ ( (((t.phrasesintext()- min.phrasesintext()) << 8) / (1 + max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext)
+ ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal)
+ ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother)
+ ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ ((255 - (((t.urllength() - min.urllength() ) << 8) / (1 + max.urllength() - min.urllength()) )) << ranking.coeff_urllength)
+ ((255 - (((t.urlcomps() - min.urlcomps() ) << 8) / (1 + max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)
+ ((255 - (((t.posintext() - min.posintext() ) << 8) / (1 + max.posintext() - min.posintext()) )) << ranking.coeff_posintext)
+ ((255 - (((t.posofphrase() - min.posofphrase() ) << 8) / (1 + max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase)
+ ((255 - (((t.posinphrase() - min.posinphrase() ) << 8) / (1 + max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase)
+ ((255 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance)
+ (((flags.get(indexRWIEntry.flag_app_url)) ? 255 << ranking.coeff_appurl : 0))
+ (((flags.get(indexRWIEntry.flag_app_descr)) ? 255 << ranking.coeff_appdescr : 0))
+ (((flags.get(indexRWIEntry.flag_app_author)) ? 255 << ranking.coeff_appauthor : 0))

@ -117,7 +117,8 @@ public class indexURLEntry {
private kelondroRow.Entry entry;
private String snippet;
private indexRWIEntry word; // this is only used if the url is transported via remote search requests
private long ranking; // during generation of a search result this value is set
public indexURLEntry(
yacyURL url,
String descr,
@ -163,6 +164,7 @@ public class indexURLEntry {
//System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString());
this.snippet = null;
this.word = null;
this.ranking = 0;
}
private void encodeDate(int col, Date d) {
@ -184,10 +186,11 @@ public class indexURLEntry {
return s.toString().getBytes();
}
public indexURLEntry(kelondroRow.Entry entry, indexRWIEntry searchedWord) {
public indexURLEntry(kelondroRow.Entry entry, indexRWIEntry searchedWord, long ranking) {
this.entry = entry;
this.snippet = null;
this.word = searchedWord;
this.ranking = ranking;
}
public indexURLEntry(Properties prop){
@ -243,6 +246,7 @@ public class indexURLEntry {
if (prop.containsKey("wi")) {
this.word = new indexRWIRowEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("wi", "")));
}
this.ranking = 0;
}
private StringBuffer corePropList() {
@ -301,6 +305,10 @@ public class indexURLEntry {
return this.entry.getColString(col_hash, null);
}
public long ranking() {
return this.ranking;
}
public indexURLEntry.Components comp() {
ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8");
return new indexURLEntry.Components(

@ -413,6 +413,7 @@ public class kelondroRowCollection {
if (this.chunkcount < isortlimit) {
isort(0, this.chunkcount, new byte[this.rowdef.objectsize]);
this.sortBound = this.chunkcount;
assert this.isSorted();
return;
}
byte[] swapspace = new byte[this.rowdef.objectsize];
@ -555,6 +556,7 @@ public class kelondroRowCollection {
public synchronized boolean isSorted() {
assert (this.rowdef.objectOrder != null);
if (chunkcount <= 1) return true;
if (chunkcount != this.sortBound) return false;
for (int i = 0; i < chunkcount - 1; i++) {
//System.out.println("*" + new String(get(i).getColBytes(0)));
if (compare(i, i + 1) > 0) {

@ -193,7 +193,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// we need to import the url
// getting the url entry
indexURLEntry urlEntry = this.importWordIndex.loadedURL.load(urlHash, null);
indexURLEntry urlEntry = this.importWordIndex.loadedURL.load(urlHash, null, 0);
if (urlEntry != null) {
/* write it into the home url db */

@ -153,7 +153,7 @@ public final class plasmaCrawlLURL {
return 0;
}
public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord) {
public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord, long ranking) {
// generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@ -165,7 +165,7 @@ public final class plasmaCrawlLURL {
try {
kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null;
return new indexURLEntry(entry, searchedWord);
return new indexURLEntry(entry, searchedWord, ranking);
} catch (IOException e) {
return null;
}
@ -176,7 +176,7 @@ public final class plasmaCrawlLURL {
indexURLEntry oldEntry;
try {
if (exists(entry.hash())) {
oldEntry = load(entry.hash(), null);
oldEntry = load(entry.hash(), null, 0);
} else {
oldEntry = null;
}
@ -342,7 +342,7 @@ public final class plasmaCrawlLURL {
if (this.iter == null) { return null; }
if (this.iter.hasNext()) { e = (kelondroRow.Entry) this.iter.next(); }
if (e == null) { return null; }
return new indexURLEntry(e, null);
return new indexURLEntry(e, null, 0);
}
public final void remove() {

@ -455,7 +455,7 @@ public final class plasmaCrawlStacker extends Thread {
// check if the url is double registered
String dbocc = sb.crawlQueues.urlExists(entry.url().hash());
indexURLEntry oldEntry = this.sb.wordIndex.loadedURL.load(entry.url().hash(), null);
indexURLEntry oldEntry = this.sb.wordIndex.loadedURL.load(entry.url().hash(), null, 0);
boolean recrawl = (oldEntry != null) && ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) > profile.recrawlIfOlder());
// apply recrawl rule
if ((dbocc != null) && (!(recrawl))) {

@ -8,7 +8,7 @@
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy: $
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -247,7 +247,7 @@ public class plasmaDHTChunk {
urlIter.remove();
continue;
}
lurl = wordIndex.loadedURL.load(iEntry.urlHash(), iEntry);
lurl = wordIndex.loadedURL.load(iEntry.urlHash(), iEntry, 0);
if ((lurl == null) || (lurl.comp() == null) || (lurl.comp().url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
notBoundCounter++;

@ -205,22 +205,15 @@ public final class plasmaSearchEvent {
} else {
// prepare result vector directly without worker threads
process.startTimer();
indexRWIEntry entry;
indexURLEntry page;
indexURLEntry uentry;
ResultEntry resultEntry;
synchronized (rankedCache) {
Iterator indexRWIEntryIterator = rankedCache.entries();
while ((indexRWIEntryIterator.hasNext()) && (resultList.size() < (query.neededResults()))) {
Iterator urlIterator = rankedCache.entries(wordIndex, true);
while ((urlIterator.hasNext()) && (resultList.size() < (query.neededResults()))) {
// fetch next entry
entry = (indexRWIEntry) indexRWIEntryIterator.next();
page = wordIndex.loadedURL.load(entry.urlHash(), entry);
uentry = (indexURLEntry) urlIterator.next();
if (page == null) {
registerFailure(entry.urlHash(), "url does not exist in lurl-db");
continue;
}
resultEntry = obtainResultEntry(page, (snippetComputationAllTime < 300) ? 1 : 0);
resultEntry = obtainResultEntry(uentry, (snippetComputationAllTime < 300) ? 1 : 0);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
@ -581,7 +574,7 @@ public final class plasmaSearchEvent {
}
}
indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry);
indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry, 0);
if (page == null) {
registerFailure(entry.urlHash(), "url does not exist in lurl-db");
continue;
@ -609,7 +602,7 @@ public final class plasmaSearchEvent {
private indexRWIEntry nextOrder() {
synchronized (rankedCache) {
Iterator i = rankedCache.entries();
Iterator i = rankedCache.entries(null, false);
indexRWIEntry entry;
String urlhash;
while (i.hasNext()) {

@ -85,28 +85,57 @@ public final class plasmaSearchQuery {
public int domMaxTargets;
public int maxDistance;
public kelondroBitfield constraint;
public boolean allofconstraint;
public boolean onlineSnippetFetch;
public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet excludeHashes, int maxDistance, String prefer, int contentdom,
boolean onlineSnippetFetch,
int lines, int offset, long maximumTime, String urlMask,
int domType, String domGroupName, int domMaxTargets,
kelondroBitfield constraint) {
this.queryString = queryString;
this.queryHashes = queryHashes;
this.excludeHashes = excludeHashes;
this.maxDistance = maxDistance;
this.prefer = prefer;
this.contentdom = contentdom;
public plasmaSearchQuery(String queryString, int lines, kelondroBitfield constraint) {
if ((queryString.length() == 12) && (kelondroBase64Order.enhancedCoder.wellformed(queryString.getBytes()))) {
this.queryString = null;
this.queryHashes = new TreeSet();
this.excludeHashes = new TreeSet();
this.queryHashes.add(queryString);
} else {
this.queryString = queryString;
TreeSet[] cq = cleanQuery(queryString);
this.queryHashes = plasmaCondenser.words2hashes(cq[0]);
this.excludeHashes = plasmaCondenser.words2hashes(cq[1]);
}
this.maxDistance = Integer.MAX_VALUE;
this.prefer = "";
this.contentdom = CONTENTDOM_ALL;
this.linesPerPage = lines;
this.offset = offset;
this.maximumTime = maximumTime;
this.urlMask = urlMask;
this.domType = domType;
this.domGroupName = domGroupName;
this.domMaxTargets = domMaxTargets;
this.offset = 0;
this.maximumTime = 10000;
this.urlMask = ".*";
this.domType = SEARCHDOM_LOCAL;
this.domGroupName = "";
this.domMaxTargets = 0;
this.constraint = constraint;
this.onlineSnippetFetch = onlineSnippetFetch;
this.allofconstraint = false;
this.onlineSnippetFetch = false;
}
public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet excludeHashes, int maxDistance, String prefer, int contentdom,
boolean onlineSnippetFetch,
int lines, int offset, long maximumTime, String urlMask,
int domType, String domGroupName, int domMaxTargets,
kelondroBitfield constraint, boolean allofconstraint) {
this.queryString = queryString;
this.queryHashes = queryHashes;
this.excludeHashes = excludeHashes;
this.maxDistance = maxDistance;
this.prefer = prefer;
this.contentdom = contentdom;
this.linesPerPage = lines;
this.offset = offset;
this.maximumTime = maximumTime;
this.urlMask = urlMask;
this.domType = domType;
this.domGroupName = domGroupName;
this.domMaxTargets = domMaxTargets;
this.constraint = constraint;
this.allofconstraint = allofconstraint;
this.onlineSnippetFetch = onlineSnippetFetch;
}
public int neededResults() {

@ -61,6 +61,7 @@ public final class plasmaSearchRankingProcess {
private int globalcount;
private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private int[] c; // flag counter
public plasmaSearchRankingProcess(plasmaSearchQuery query, plasmaSearchProcessing process, plasmaSearchRankingProfile ranking, int maxentries) {
// we collect the urlhashes and construct a list with urlEntry objects
@ -74,6 +75,8 @@ public final class plasmaSearchRankingProcess {
this.globalcount = 0;
this.urlhashes = new HashMap();
this.ref = new kelondroMScoreCluster();
c = new int[32];
for (int i = 0; i < 32; i++) {c[i] = 0;}
}
public void insert(indexContainer container, boolean local) {
@ -83,12 +86,12 @@ public final class plasmaSearchRankingProcess {
assert (container != null);
if (container.size() == 0) return;
process.startTimer();
if (process != null) process.startTimer();
if (this.order == null) {
this.order = new indexRWIEntryOrder(ranking);
}
this.order.extend(container);
process.yield("normalizing", container.size());
if (process != null) process.yield("normalizing", container.size());
/*
container.setOrdering(o, 0);
@ -96,7 +99,7 @@ public final class plasmaSearchRankingProcess {
*/
// normalize entries and get ranking
process.startTimer();
if (process != null) process.startTimer();
Iterator i = container.entries();
this.pageAcc = new TreeMap();
indexRWIEntry iEntry, l;
@ -106,9 +109,15 @@ public final class plasmaSearchRankingProcess {
while (i.hasNext()) {
iEntry = (indexRWIEntry) i.next();
if (iEntry.urlHash().length() != container.row().primaryKeyLength) continue;
// increase flag counts
for (int j = 0; j < 32; j++) {
if (iEntry.flags().get(j)) {c[j]++;}
}
// kick out entries that are too bad acording to current findings
r = new Long(order.cardinal(iEntry));
if ((pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;
if ((maxentries >= 0) && (pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;
// check constraints
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint
@ -118,7 +127,7 @@ public final class plasmaSearchRankingProcess {
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue;
}
if (pageAcc.size() < maxentries) {
if ((maxentries < 0) || (pageAcc.size() < maxentries)) {
if (urlhashes.containsKey(iEntry.urlHash())) continue;
while (pageAcc.containsKey(r)) r = new Long(r.longValue() + 1);
pageAcc.put(r, iEntry);
@ -145,7 +154,38 @@ public final class plasmaSearchRankingProcess {
if (container.size() > query.neededResults()) remove(true, true);
process.yield(plasmaSearchProcessing.PRESORT, container.size());
if (process != null) process.yield(plasmaSearchProcessing.PRESORT, container.size());
}
public class rIterator implements Iterator {
boolean urls;
Iterator r;
plasmaWordIndex wi;
public rIterator(plasmaWordIndex wi, boolean fetchURLs) {
// if fetchURLs == true, this iterates indexURLEntry objects, otherwise it iterates indexRWIEntry objects
this.urls = fetchURLs;
this.r = pageAcc.entrySet().iterator();
this.wi = wi;
}
public boolean hasNext() {
return r.hasNext();
}
public Object next() {
Map.Entry entry = (Map.Entry) r.next();
indexRWIEntry ientry = (indexRWIEntry) entry.getValue();
if (urls) {
return wi.loadedURL.load(ientry.urlHash(), ientry, ((Long) entry.getKey()).longValue());
} else {
return ientry;
}
}
public void remove() {
throw new UnsupportedOperationException();
}
}
public int size() {
@ -153,6 +193,10 @@ public final class plasmaSearchRankingProcess {
return pageAcc.size();
}
public int[] flagCount() {
return c;
}
public int filteredCount() {
return this.filteredCount;
}
@ -170,9 +214,9 @@ public final class plasmaSearchRankingProcess {
return iEntry;
}
public Iterator entries() {
// returns an iterator of indexRWIEntry objects in the ranked order, best entry first
return this.pageAcc.values().iterator();
public Iterator entries(plasmaWordIndex wi, boolean fetchURLs) {
// if fetchURLs == true, this iterates indexURLEntry objects, otherwise it iterates indexRWIEntry objects
return new rIterator(wi, fetchURLs);
}
public Set getReferences(int count) {

@ -64,6 +64,7 @@ public class plasmaSearchRankingProfile {
public static final String HITCOUNT = "hitcount";
public static final String POSINTEXT = "posintext";
public static final String POSOFPHRASE = "posofphrase";
public static final String POSINPHRASE = "posinphrase";
public static final String WORDDISTANCE = "worddistance";
public static final String APPURL = "appurl";
public static final String APPDESCR = "appdescr";
@ -154,6 +155,7 @@ public class plasmaSearchRankingProfile {
coeff_hitcount = parseMap(coeff, HITCOUNT, coeff_hitcount);
coeff_posintext = parseMap(coeff, POSINTEXT, coeff_posintext);
coeff_posofphrase = parseMap(coeff, POSOFPHRASE, coeff_posofphrase);
coeff_posinphrase = parseMap(coeff, POSINPHRASE, coeff_posinphrase);
coeff_worddistance = parseMap(coeff, WORDDISTANCE, coeff_worddistance);
coeff_appurl = parseMap(coeff, APPURL, coeff_appurl);
coeff_appdescr = parseMap(coeff, APPDESCR, coeff_appdescr);
@ -207,6 +209,7 @@ public class plasmaSearchRankingProfile {
ext.put(prefix + HITCOUNT, Integer.toString(coeff_hitcount));
ext.put(prefix + POSINTEXT, Integer.toString(coeff_posintext));
ext.put(prefix + POSOFPHRASE, Integer.toString(coeff_posofphrase));
ext.put(prefix + POSINPHRASE, Integer.toString(coeff_posinphrase));
ext.put(prefix + WORDDISTANCE, Integer.toString(coeff_worddistance));
ext.put(prefix + APPURL, Integer.toString(coeff_appurl));
ext.put(prefix + APPDESCR, Integer.toString(coeff_appdescr));

@ -1499,7 +1499,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (urlhash.equals(yacyURL.dummyHash)) return null;
yacyURL ne = crawlQueues.getURL(urlhash);
if (ne != null) return ne;
indexURLEntry le = wordIndex.loadedURL.load(urlhash, null);
indexURLEntry le = wordIndex.loadedURL.load(urlhash, null, 0);
if (le != null) return le.comp().url();
return null;
}
@ -2541,7 +2541,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry
// determine the url string
indexURLEntry entry = wordIndex.loadedURL.load(urlhash, null);
indexURLEntry entry = wordIndex.loadedURL.load(urlhash, null, 0);
if (entry == null) return 0;
indexURLEntry.Components comp = entry.comp();
if (comp.url() == null) return 0;

@ -328,7 +328,7 @@ public class plasmaSwitchboardQueue {
public yacyURL referrerURL() {
if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(yacyURL.dummyHash))) return null;
indexURLEntry entry = lurls.load(referrerHash, null);
indexURLEntry entry = lurls.load(referrerHash, null, 0);
if (entry == null) referrerURL = null; else referrerURL = entry.comp().url();
}
return referrerURL;

@ -389,89 +389,106 @@ public final class plasmaWordIndex implements indexRI {
return containers;
}
public Finding retrieveURLs(String keyhash, kelondroBitfield filter, boolean all, int maxcount, boolean loadurl, int sortorder) {
public Finding retrieveURLs(plasmaSearchQuery query, boolean loadurl, int sortorder, plasmaSearchRankingProfile ranking) {
// search for a word hash and generate a list of url links
// sortorder: 0 = hash, 1 = url, 2 = ranking
assert query.queryHashes.size() == 1;
final TreeSet mi = new TreeSet();
String keyhash = (String) query.queryHashes.first();
kelondroBitfield filter = query.constraint;
indexContainer index = getContainer(keyhash, null);
final TreeMap tm = new TreeMap();
final TreeSet mi = new TreeSet();
final ArrayList indexes = new ArrayList();
indexRWIEntry ientry;
indexURLEntry uentry;
final int[] c = new int[32];
for (int i = 0; i < 32; i++) {c[i] = 0;}
if ((index != null) && (index.size() != 0)) {
final Iterator en = index.entries();
// generate a new map where the urls are sorted (not by hash but by the url text)
indexRWIEntry ientry;
indexURLEntry uentry;
loop: while (en.hasNext()) {
ientry = (indexRWIEntry) en.next();
// test if ientry matches with filter
if (filter != null) {
// if all = true: let only entries pass that has all matching bits
// if all = false: let all entries pass that has at least one matching bit
if (all) {
for (int i = 0; i < 32; i++) {
if ((filter.get(i)) && (!ientry.flags().get(i))) continue loop;
}
} else {
boolean nok = true;
flagtest: for (int i = 0; i < 32; i++) {
if ((filter.get(i)) && (ientry.flags().get(i))) {nok = false; break flagtest;}
}
if (nok) continue loop;
}
}
// increase flag counts
for (int i = 0; i < 32; i++) {
if (ientry.flags().get(i)) {c[i]++;}
}
// load url
if (loadurl) {
uentry = loadedURL.load(ientry.urlHash(), null);
if (uentry == null) {
mi.add(ientry.urlHash());
} else {
if (sortorder == 0) {
tm.put(uentry.comp().url().toNormalform(false, true), new Item(ientry, uentry));
}
if (sortorder == 1) {
tm.put(ientry.urlHash(), new Item(ientry, uentry));
}
}
} else {
indexes.add(new Item(ientry, null));
}
if ((maxcount > 0) && (mi.size() + tm.size() > maxcount)) break loop;
}
for (int i = 0; i < 32; i++) {c[i] = 0;}
if ((index == null) || (index.size() == 0)) {
return new Finding(mi.iterator(), mi.iterator(), mi, 0, c);
}
if (loadurl) {
return new Finding(tm.values().iterator(), mi, tm.size(), c);
if (sortorder == 2) {
plasmaSearchRankingProcess process = new plasmaSearchRankingProcess(query, null, ranking, query.neededResults());
process.insert(index, true);
return new Finding(process.entries(this, true), null, mi, process.filteredCount(), process.flagCount());
} else {
return new Finding(indexes.iterator(), mi, indexes.size(), c);
final TreeMap tm = new TreeMap();
final ArrayList indexes = new ArrayList();
final Iterator en = index.entries();
// generate a new map where the urls are sorted (not by hash but by the url text)
loop: while (en.hasNext()) {
ientry = (indexRWIEntry) en.next();
// test if ientry matches with filter
if (filter != null) {
// if all = true: let only entries pass that has all matching bits
// if all = false: let all entries pass that has at least one matching bit
if (query.allofconstraint) {
for (int i = 0; i < 32; i++) {
if ((filter.get(i)) && (!ientry.flags().get(i))) continue loop;
}
} else {
boolean nok = true;
flagtest: for (int i = 0; i < 32; i++) {
if ((filter.get(i)) && (ientry.flags().get(i))) {nok = false; break flagtest;}
}
if (nok) continue loop;
}
}
// increase flag counts
for (int i = 0; i < 32; i++) {
if (ientry.flags().get(i)) {c[i]++;}
}
// load url
if (loadurl) {
uentry = loadedURL.load(ientry.urlHash(), ientry, 0);
if (uentry == null) {
mi.add(ientry.urlHash());
} else {
if (sortorder == 0) {
tm.put(uentry.comp().url().toNormalform(false, true), uentry);
}
if (sortorder == 1) {
tm.put(ientry.urlHash(), uentry);
}
}
} else {
indexes.add(ientry);
}
if ((query.neededResults() > 0) && (mi.size() + tm.size() > query.neededResults())) break loop;
} // end loop
if (loadurl) {
return new Finding(tm.values().iterator(), null, mi, tm.size(), c);
} else {
return new Finding(null, indexes.iterator(), mi, indexes.size(), c);
}
}
}
public class Finding {
private Iterator items; // an iterator if Items objects
public static class Finding {
private Iterator urls; // an iterator if indexURLEntry objects
private Iterator rwientries; // an iterator of indexRWIEntry objects
private Set misses; // a set of hashes where we did not found items
private int findcount;
private int[] flagcount;
public Finding(Iterator items, Set misses, int findcount, int[] flagcount) {
public Finding(Iterator urls, Iterator rwientries, Set misses, int findcount, int[] flagcount) {
this.findcount = findcount;
this.items = items;
this.urls = urls;
this.rwientries = rwientries;
this.misses = misses;
this.flagcount = flagcount;
}
public int size() {
return this.findcount;
}
public Iterator hit() {
return this.items;
public Iterator urls() {
return this.urls;
}
public Iterator rwientries() {
return this.rwientries;
}
public Set miss() {
return this.misses;
@ -481,28 +498,6 @@ public final class plasmaWordIndex implements indexRI {
}
}
public class Item {
private indexRWIEntry ientry;
private indexURLEntry uentry;
public Item() {
ientry = null;
uentry = null;
}
public Item(indexRWIEntry ientry, indexURLEntry uentry) {
this.ientry = ientry;
this.uentry = uentry;
}
public boolean found() {
return (ientry != null) && (uentry != null);
}
public indexRWIEntry index() {
return this.ientry;
}
public indexURLEntry url() {
return this.uentry;
}
}
public int size() {
return java.lang.Math.max(collections.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size()));
}
@ -712,7 +707,7 @@ public final class plasmaWordIndex implements indexRI {
entry = (indexRWIEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
indexURLEntry ue = lurl.load(entry.urlHash(), null);
indexURLEntry ue = lurl.load(entry.urlHash(), entry, 0);
if (ue == null) {
urlHashs.add(entry.urlHash());
} else {

@ -991,10 +991,11 @@ public class yacyURL {
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0);
}
private static final char rootURLFlag = subdomPortPath("www", 80, "");
private static final char rootURLFlag0 = subdomPortPath("", 80, "");
private static final char rootURLFlag1 = subdomPortPath("www", 80, "");
public static final boolean probablyRootURL(String urlHash) {
return (urlHash.charAt(5) == rootURLFlag);
return (urlHash.charAt(5) == rootURLFlag0) || (urlHash.charAt(5) == rootURLFlag1);
}
private static String protocolHostPort(String protocol, String host, int port) {

@ -630,7 +630,7 @@ public final class yacy {
iEntry = (indexRWIEntry) wordIdxEntries.next();
String urlHash = iEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
indexURLEntry urlEntry = currentUrlDB.load(urlHash, null);
indexURLEntry urlEntry = currentUrlDB.load(urlHash, null, 0);
urlCounter++;
minimizedUrlDB.store(urlEntry);
if (urlCounter % 500 == 0) {

Loading…
Cancel
Save