- enhanced monitoring of ranking parameters

for details, please try http://localhost:8080/IndexControlRWIs_p.html
- fixed computation of ranking ordering in some cases

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4220 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent bd5673efbe
commit c527969185

@ -169,7 +169,7 @@ public class Bookmarks {
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash); bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) { if (bookmark == null) {
// try to get the bookmark from the LURL database // try to get the bookmark from the LURL database
indexURLEntry urlentry = switchboard.wordIndex.loadedURL.load(urlHash, null); indexURLEntry urlentry = switchboard.wordIndex.loadedURL.load(urlHash, null, 0);
plasmaParserDocument document = null; plasmaParserDocument document = null;
if (urlentry != null) { if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp(); indexURLEntry.Components comp = urlentry.comp();

@ -167,7 +167,7 @@ public class CrawlResults {
urlHash = sb.wordIndex.loadedURL.getUrlHash(tabletype, i); urlHash = sb.wordIndex.loadedURL.getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try { try {
urle = sb.wordIndex.loadedURL.load(urlHash, null); urle = sb.wordIndex.loadedURL.load(urlHash, null, 0);
indexURLEntry.Components comp = urle.comp(); indexURLEntry.Components comp = urle.comp();
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString()); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);

@ -92,9 +92,9 @@
<input type="radio" name="lines" value="1000" />1000&nbsp;&nbsp; <input type="radio" name="lines" value="1000" />1000&nbsp;&nbsp;
</dd> </dd>
<dt class="TableCellDark">Ordering of list:</dt> <dt class="TableCellDark">Ordering of list:</dt>
<dd><input type="radio" name="ordering" value="0" checked="checked" />by URL&nbsp;&nbsp; <dd><input type="radio" name="ordering" value="2" checked="checked" />by Ranking&nbsp;&nbsp;
<input type="radio" name="ordering" value="0"/>by URL&nbsp;&nbsp;
<input type="radio" name="ordering" value="1" />by URL Hash&nbsp;&nbsp; <input type="radio" name="ordering" value="1" />by URL Hash&nbsp;&nbsp;
<!-- <input type="radio" name="ordering" value="2" />by Ranking&nbsp;&nbsp;-->
</dd> </dd>
<dt class="TableCellLight"></dt> <dt class="TableCellLight"></dt>
<dd><input type="submit" name="urllist" value="List Selected URLs" /> <dd><input type="submit" name="urllist" value="List Selected URLs" />
@ -134,30 +134,61 @@
<form action="IndexControlRWIs_p.html" method="post" enctype="multipart/form-data"> <form action="IndexControlRWIs_p.html" method="post" enctype="multipart/form-data">
<p> <p>
<table border="0" cellpadding="2" cellspacing="1"> <table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader"><td>&nbsp;</td><td>hash</td><td>url</td><td>pos</td><td>phrase</td><td>urlcomps</td><td>urllength</td><td width="60%">props</td></tr> <tr class="TableHeader">
<td colspan="3">Resource</td>
<td colspan="8">Negative Ranking Factors</td>
<td colspan="7">Positive Ranking Factors</td>
<td rowspan="2">Reverse Normalized Weighted Ranking Sum</td>
</tr>
<tr class="TableHeader">
<td>&nbsp;</td>
<td>hash</td>
<td>url</td>
<td>dom length</td>
<td>ybr</td>
<td>url comps</td>
<td>url length</td>
<td>pos in text</td>
<td>pos of phrase</td>
<td>pos in phrase</td>
<td>word distance</td>
<td>date</td>
<td>words in title</td>
<td>words in text</td>
<td>local links</td>
<td>remote links</td>
<td>hitcount</td>
<td>props</td>
<td></td>
</tr>
#{urlList}# #{urlList}#
<tr class="TableCellLight"> <tr class="TableCellLight">
#(urlExists)# #(urlExists)#
<td><input type="checkbox" id="urlhx.#[urlhxCount]#" name="urlhx.#[urlhxCount]#" checked value="#[urlhxValue]#" align="top" /> <td class="TableCellDark"><input type="checkbox" id="urlhx.#[urlhxCount]#" name="urlhx.#[urlhxCount]#" checked value="#[urlhxValue]#" align="top" />
<label for="urlhx.#[urlhxCount]#" class="tt">#[urlhxValue]#</label></td> <label for="urlhx.#[urlhxCount]#" class="tt">#[urlhxValue]#</label></td>
<td>&lt;unresolved URL Hash&gt;</td> <td>&lt;unresolved URL Hash&gt;</td>
<td></td> <td colspan="15"></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</span>
:: ::
<td><input type="checkbox" id="urlhx.#[urlhxCount]#" name="urlhx.#[urlhxCount]#" #(urlhxChecked)#::checked="checked" #(/urlhxChecked)#value="#[urlhxValue]#" align="top" /> <td><input type="checkbox" id="urlhx.#[urlhxCount]#" name="urlhx.#[urlhxCount]#" #(urlhxChecked)#::checked="checked" #(/urlhxChecked)#value="#[urlhxValue]#" align="top" />
<label for="urlhx.#[urlhxCount]#" class="tt"></label></td> <label for="urlhx.#[urlhxCount]#" class="tt"></label></td>
<td><a href="/IndexControlURLs_p.html?keystring=#[keyString]#&amp;keyhash=#[keyHash]#&amp;urlhash=#[urlhxValue]#&amp;urlstringsearch=&amp;urlstring=#[urlString]#" class="tt">#[urlhxValue]#</a></td> <td><a href="/IndexControlURLs_p.html?keystring=#[keyString]#&amp;keyhash=#[keyHash]#&amp;urlhash=#[urlhxValue]#&amp;urlstringsearch=&amp;urlstring=#[urlString]#" class="tt">#[urlhxValue]#</a></td>
<td><a href="#[urlString]#">#[urlStringShort]#</a></td> <td><a href="#[urlString]#">#[urlStringShort]#</a></td>
<td>#[pos]#</td> <td class="TableCellDark">#[domlength]#</td>
<td>#[phrase]#</td> <td class="TableCellDark">#[ybr]#</td>
<td>#[urlcomps]#</td> <td class="TableCellDark">#[urlcomps]#</td>
<td>#[urllength]#</td> <td class="TableCellDark">#[urllength]#</td>
<td class="TableCellDark">#[pos]#</td>
<td class="TableCellDark">#[phrase]#</td>
<td class="TableCellDark">#[posinphrase]#</td>
<td class="TableCellDark">#[worddistance]#</td>
<td>#[date]#</td>
<td>#[wordsintitle]#</td>
<td>#[wordsintext]#</td>
<td>#[llocal]#</td>
<td>#[lother]#</td>
<td>#[hitcount]#</td>
<td>#[props]#</td> <td>#[props]#</td>
<td align="right" class="TableCellDark">#[ranking]#</td>
#(/urlExists)# #(/urlExists)#
</tr> </tr>
#{/urlList}# #{/urlList}#

@ -30,6 +30,7 @@ import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.io.PrintWriter; import java.io.PrintWriter;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
@ -43,10 +44,13 @@ import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.urlPattern.abstractURLPattern; import de.anomic.plasma.urlPattern.abstractURLPattern;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacyClient;
@ -198,7 +202,7 @@ public class IndexControlRWIs_p {
indexURLEntry lurl; indexURLEntry lurl;
while (urlIter.hasNext()) { while (urlIter.hasNext()) {
iEntry = (indexRWIEntry) urlIter.next(); iEntry = (indexRWIEntry) urlIter.next();
lurl = sb.wordIndex.loadedURL.load(iEntry.urlHash(), null); lurl = sb.wordIndex.loadedURL.load(iEntry.urlHash(), null, 0);
if (lurl == null) { if (lurl == null) {
unknownURLEntries.add(iEntry.urlHash()); unknownURLEntries.add(iEntry.urlHash());
urlIter.remove(); urlIter.remove();
@ -255,7 +259,7 @@ public class IndexControlRWIs_p {
yacyURL url; yacyURL url;
for (int i=0; i<urlx.length; i++) { for (int i=0; i<urlx.length; i++) {
urlHashes.add(urlx[i]); urlHashes.add(urlx[i]);
indexURLEntry e = sb.wordIndex.loadedURL.load(urlx[i], null); indexURLEntry e = sb.wordIndex.loadedURL.load(urlx[i], null, 0);
sb.wordIndex.loadedURL.remove(urlx[i]); sb.wordIndex.loadedURL.remove(urlx[i]);
if (e != null) { if (e != null) {
url = e.comp().url(); url = e.comp().url();
@ -283,7 +287,7 @@ public class IndexControlRWIs_p {
yacyURL url; yacyURL url;
for (int i=0; i<urlx.length; i++) { for (int i=0; i<urlx.length; i++) {
urlHashes.add(urlx[i]); urlHashes.add(urlx[i]);
indexURLEntry e = sb.wordIndex.loadedURL.load(urlx[i], null); indexURLEntry e = sb.wordIndex.loadedURL.load(urlx[i], null, 0);
sb.wordIndex.loadedURL.remove(urlx[i]); sb.wordIndex.loadedURL.remove(urlx[i]);
if (e != null) { if (e != null) {
url = e.comp().url(); url = e.comp().url();
@ -357,7 +361,7 @@ public class IndexControlRWIs_p {
} }
private static plasmaWordIndex.Finding genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, boolean urlfetch, int sortorder) { private static plasmaWordIndex.Finding genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, boolean urlfetch, int sortorder) {
final plasmaWordIndex.Finding finding = sb.wordIndex.retrieveURLs(keyhash, filter, false, -1, urlfetch, sortorder); final plasmaWordIndex.Finding finding = sb.wordIndex.retrieveURLs(new plasmaSearchQuery(keyhash, -1, filter), urlfetch, sortorder, sb.getRanking());
if (finding.size() == 0) { if (finding.size() == 0) {
prop.put("searchresult", 2); prop.put("searchresult", 2);
prop.put("searchresult_wordhash", keyhash); prop.put("searchresult_wordhash", keyhash);
@ -395,37 +399,53 @@ public class IndexControlRWIs_p {
prop.put("genUrlList_ordering", ordering); prop.put("genUrlList_ordering", ordering);
int i = 0; int i = 0;
yacyURL url; yacyURL url;
Iterator iter = finding.hit(); Iterator iter = finding.urls();
plasmaWordIndex.Item entry; indexURLEntry entry;
String us; String us;
long rn = -1;
while (iter.hasNext()) { while (iter.hasNext()) {
entry = (plasmaWordIndex.Item) iter.next(); entry = (indexURLEntry) iter.next();
us = entry.url().comp().url().toNormalform(false, false); us = entry.comp().url().toNormalform(false, false);
if (rn == -1) rn = entry.ranking();
prop.put("genUrlList_urlList_"+i+"_urlExists", "1"); prop.put("genUrlList_urlList_"+i+"_urlExists", "1");
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i); prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.index().urlHash()); prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().urlHash());
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring); prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring);
prop.put("genUrlList_urlList_"+i+"_urlExists_keyHash", keyhash); prop.put("genUrlList_urlList_"+i+"_urlExists_keyHash", keyhash);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlString", us); prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlString", us);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 60) ? (us.substring(0, 60) + "...") : us); prop.put("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "<br>" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "<br>" + us.substring(20)) : us));
prop.put("genUrlList_urlList_"+i+"_urlExists_pos", entry.index().posintext()); prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn));
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.index().posofphrase()); prop.put("genUrlList_urlList_"+i+"_urlExists_domlength", yacyURL.domLengthEstimation(entry.hash()));
prop.put("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.index().urlcomps()); prop.put("genUrlList_urlList_"+i+"_urlExists_ybr", plasmaSearchRankingProcess.ybr(entry.hash()));
prop.put("genUrlList_urlList_"+i+"_urlExists_urllength", entry.index().urllength()); prop.put("genUrlList_urlList_"+i+"_urlExists_date", serverDate.shortDayTime(new Date(entry.word().lastModified())));
prop.put("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle());
prop.put("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext());
prop.put("genUrlList_urlList_"+i+"_urlExists_phrasesintext", entry.word().phrasesintext());
prop.put("genUrlList_urlList_"+i+"_urlExists_llocal", entry.word().llocal());
prop.put("genUrlList_urlList_"+i+"_urlExists_lother", entry.word().lother());
prop.put("genUrlList_urlList_"+i+"_urlExists_hitcount", entry.word().hitcount());
prop.put("genUrlList_urlList_"+i+"_urlExists_worddistance", entry.word().worddistance());
prop.put("genUrlList_urlList_"+i+"_urlExists_pos", entry.word().posintext());
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase());
prop.put("genUrlList_urlList_"+i+"_urlExists_posinphrase", entry.word().posinphrase());
prop.put("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.word().urlcomps());
prop.put("genUrlList_urlList_"+i+"_urlExists_urllength", entry.word().urllength());
prop.put("genUrlList_urlList_"+i+"_urlExists_props", prop.put("genUrlList_urlList_"+i+"_urlExists_props",
((entry.index().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") + ((entry.word().flags().get(plasmaCondenser.flag_cat_indexof)) ? "appears on index page, " : "") +
((entry.index().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") +
((entry.index().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
((entry.index().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_url)) ? "appears in url, " : "") + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_descr)) ? "appears in description, " : "") + ((entry.word().flags().get(indexRWIEntry.flag_app_url)) ? "appears in url, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_author)) ? "appears in author, " : "") + ((entry.word().flags().get(indexRWIEntry.flag_app_descr)) ? "appears in description, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_tags)) ? "appears in tags, " : "") + ((entry.word().flags().get(indexRWIEntry.flag_app_author)) ? "appears in author, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_reference)) ? "appears in reference, " : "") + ((entry.word().flags().get(indexRWIEntry.flag_app_tags)) ? "appears in tags, " : "") +
((entry.index().flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized" : "") ((entry.word().flags().get(indexRWIEntry.flag_app_reference)) ? "appears in reference, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized, " : "") +
((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "")
); );
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.index().posofphrase()); prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase());
prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.index().posofphrase()); prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase());
try { try {
url = new yacyURL(us, null); url = new yacyURL(us, null);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {

@ -76,7 +76,7 @@ public class IndexControlURLs_p {
} }
if (post.containsKey("urlhashdelete")) { if (post.containsKey("urlhashdelete")) {
indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null); indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null, 0);
if (entry == null) { if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else { } else {
@ -106,7 +106,7 @@ public class IndexControlURLs_p {
yacyURL url = new yacyURL(urlstring, null); yacyURL url = new yacyURL(urlstring, null);
urlhash = url.hash(); urlhash = url.hash();
prop.put("urlhash", urlhash); prop.put("urlhash", urlhash);
indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null); indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null, 0);
if (entry == null) { if (entry == null) {
prop.putHTML("urlstring", "unknown url: " + urlstring); prop.putHTML("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", ""); prop.put("urlhash", "");
@ -120,7 +120,7 @@ public class IndexControlURLs_p {
} }
if (post.containsKey("urlhashsearch")) { if (post.containsKey("urlhashsearch")) {
indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null); indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null, 0);
if (entry == null) { if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash); prop.put("result", "No Entry for URL hash " + urlhash);
} else { } else {
@ -172,7 +172,7 @@ public class IndexControlURLs_p {
} }
indexURLEntry.Components comp = entry.comp(); indexURLEntry.Components comp = entry.comp();
String referrer = null; String referrer = null;
indexURLEntry le = (entry.referrerHash() == null) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null); indexURLEntry le = (entry.referrerHash() == null) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null, 0);
if (le == null) { if (le == null) {
referrer = "<unknown>"; referrer = "<unknown>";
} else { } else {

@ -30,6 +30,7 @@ import java.util.Iterator;
import java.util.Map; import java.util.Map;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
@ -63,6 +64,7 @@ public class Ranking_p {
rankingParameters.put(plasmaSearchRankingProfile.PHRASESINTEXT, "Phrases In Text"); rankingParameters.put(plasmaSearchRankingProfile.PHRASESINTEXT, "Phrases In Text");
rankingParameters.put(plasmaSearchRankingProfile.POSINTEXT, "Position In Text"); rankingParameters.put(plasmaSearchRankingProfile.POSINTEXT, "Position In Text");
rankingParameters.put(plasmaSearchRankingProfile.POSOFPHRASE, "Position Of Phrase"); rankingParameters.put(plasmaSearchRankingProfile.POSOFPHRASE, "Position Of Phrase");
rankingParameters.put(plasmaSearchRankingProfile.POSINPHRASE, "Position In Phrase");
rankingParameters.put(plasmaSearchRankingProfile.PREFER, "Application Of Prefer Pattern"); rankingParameters.put(plasmaSearchRankingProfile.PREFER, "Application Of Prefer Pattern");
rankingParameters.put(plasmaSearchRankingProfile.URLCOMPINTOPLIST, "URL Component Appears In Toplist"); rankingParameters.put(plasmaSearchRankingProfile.URLCOMPINTOPLIST, "URL Component Appears In Toplist");
rankingParameters.put(plasmaSearchRankingProfile.URLCOMPS, "URL Components"); rankingParameters.put(plasmaSearchRankingProfile.URLCOMPS, "URL Components");
@ -127,6 +129,9 @@ public class Ranking_p {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
final plasmaSwitchboard sb = (plasmaSwitchboard) env; final plasmaSwitchboard sb = (plasmaSwitchboard) env;
// clean up all search events
plasmaSearchEvent.cleanupEvents(true);
// case if no values are requested // case if no values are requested
if ((post == null) || (env == null)) { if ((post == null) || (env == null)) {
// we create empty entries for template strings // we create empty entries for template strings

@ -109,7 +109,7 @@ public class ViewFile {
if (urlHash.length() > 0) { if (urlHash.length() > 0) {
// getting the urlEntry that belongs to the url hash // getting the urlEntry that belongs to the url hash
indexURLEntry urlEntry = null; indexURLEntry urlEntry = null;
urlEntry = sb.wordIndex.loadedURL.load(urlHash, null); urlEntry = sb.wordIndex.loadedURL.load(urlHash, null, 0);
if (urlEntry == null) { if (urlEntry == null) {
prop.put("error", "2"); prop.put("error", "2");
prop.put("viewMode",VIEW_MODE_NO_TEXT); prop.put("viewMode",VIEW_MODE_NO_TEXT);

@ -261,7 +261,7 @@ public final class crawlOrder {
reason = reasonString; reason = reasonString;
// send lurl-Entry as response // send lurl-Entry as response
indexURLEntry entry; indexURLEntry entry;
entry = switchboard.wordIndex.loadedURL.load(url.hash(), null); entry = switchboard.wordIndex.loadedURL.load(url.hash(), null, 0);
if (entry == null) { if (entry == null) {
response = "rejected"; response = "rejected";
lurl = ""; lurl = "";

@ -133,7 +133,7 @@ public final class search {
long urlRetrievalAllTime = 0, snippetComputationAllTime = 0; long urlRetrievalAllTime = 0, snippetComputationAllTime = 0;
if ((query.length() == 0) && (abstractSet != null)) { if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts // this is _not_ a normal search, only a request for index abstracts
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint); theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint, false);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
@ -162,7 +162,7 @@ public final class search {
} else { } else {
// retrieve index containers from search request // retrieve index containers from search request
theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint); theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");

@ -169,7 +169,7 @@ public class yacysearch {
kelondroBitfield constraint = post.containsKey("constraint") ? new kelondroBitfield(4, post.get("constraint", "______")) : plasmaSearchQuery.catchall_constraint; kelondroBitfield constraint = post.containsKey("constraint") ? new kelondroBitfield(4, post.get("constraint", "______")) : plasmaSearchQuery.catchall_constraint;
if (indexof) { if (indexof) {
constraint = new kelondroBitfield(); constraint = new kelondroBitfield(4);
constraint.set(plasmaCondenser.flag_cat_indexof, true); constraint.set(plasmaCondenser.flag_cat_indexof, true);
} }
@ -225,7 +225,7 @@ public class yacysearch {
return prop; return prop;
} }
final String recommendHash = post.get("recommendref", ""); // urlhash final String recommendHash = post.get("recommendref", ""); // urlhash
indexURLEntry urlentry = sb.wordIndex.loadedURL.load(recommendHash, null); indexURLEntry urlentry = sb.wordIndex.loadedURL.load(recommendHash, null, 0);
if (urlentry != null) { if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp(); indexURLEntry.Components comp = urlentry.comp();
plasmaParserDocument document; plasmaParserDocument document;
@ -266,7 +266,8 @@ public class yacysearch {
((globalsearch) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL), ((globalsearch) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL),
"", "",
20, 20,
constraint); constraint,
false);
plasmaSearchProcessing localTiming = new plasmaSearchProcessing(4 * theQuery.maximumTime / 10, theQuery.displayResults()); plasmaSearchProcessing localTiming = new plasmaSearchProcessing(4 * theQuery.maximumTime / 10, theQuery.displayResults());
String client = (String) header.get("CLIENTIP"); // the search client who initiated the search String client = (String) header.get("CLIENTIP"); // the search client who initiated the search

@ -286,7 +286,7 @@ public class SitemapParser extends DefaultHandler {
String dbocc = this.switchboard.urlExists(nexturlhash); String dbocc = this.switchboard.urlExists(nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) { if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
// the url was already loaded. we need to check the date // the url was already loaded. we need to check the date
indexURLEntry oldEntry = this.switchboard.wordIndex.loadedURL.load(nexturlhash, null); indexURLEntry oldEntry = this.switchboard.wordIndex.loadedURL.load(nexturlhash, null, 0);
if (oldEntry != null) { if (oldEntry != null) {
Date modDate = oldEntry.moddate(); Date modDate = oldEntry.moddate();
// check if modDate is null // check if modDate is null

@ -234,6 +234,7 @@ public final class httpc {
// do NOT remove this check; in case that everything works fine this call does nothing // do NOT remove this check; in case that everything works fine this call does nothing
// but if in any error case connections stay open, this will ensure that the peer keeps running and the host server is not blocked from working // but if in any error case connections stay open, this will ensure that the peer keeps running and the host server is not blocked from working
checkIdleConnections(); checkIdleConnections();
assert timeout != 0;
// register new connection // register new connection
this.hashIndex = objCounter; this.hashIndex = objCounter;
@ -401,9 +402,10 @@ public final class httpc {
this.initTime = System.currentTimeMillis(); this.initTime = System.currentTimeMillis();
this.lastIO = System.currentTimeMillis(); this.lastIO = System.currentTimeMillis();
this.socket.setKeepAlive(false); this.socket.setKeepAlive(false);
this.socket.connect(address, timeout);
// setting socket timeout and keep alive behaviour // setting socket timeout and keep alive behaviour
this.socket.setSoTimeout(timeout); // waiting time for read this.socket.setSoTimeout(timeout); // waiting time for read
// get the connection
this.socket.connect(address, timeout);
if (incomingByteCountAccounting != null) { if (incomingByteCountAccounting != null) {
this.clientInputByteCount = new httpdByteCountInputStream(this.socket.getInputStream(),incomingByteCountAccounting); this.clientInputByteCount = new httpdByteCountInputStream(this.socket.getInputStream(),incomingByteCountAccounting);

@ -87,21 +87,22 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder implements kelondr
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords); //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry // the normalizedEntry must be a normalized indexEntry
kelondroBitfield flags = t.flags(); kelondroBitfield flags = t.flags();
long r = ((255 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength) long r =
+ ((255 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4 )) << ranking.coeff_ybr) ((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
+ ((255 - (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) )) << ranking.coeff_date) + ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr)
+ ((t.urlcomps() == 0) ? 0 : ((256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (1 + max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps))
+ ((t.urllength() == 0) ? 0 : ((256 - (((t.urllength() - min.urllength() ) << 8) / (1 + max.urllength() - min.urllength()) )) << ranking.coeff_urllength))
+ ((t.posintext() == 0) ? 0 : ((256 - (((t.posintext() - min.posintext() ) << 8) / (1 + max.posintext() - min.posintext()) )) << ranking.coeff_posintext))
+ ((t.posofphrase() == 0) ? 0 : ((256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (1 + max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase))
+ ((t.posinphrase() == 0) ? 0 : ((256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (1 + max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase))
+ ((256 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance)
+ ( (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date)
+ ( (((t.wordsintitle() - min.wordsintitle() ) << 8) / (1 + max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle) + ( (((t.wordsintitle() - min.wordsintitle() ) << 8) / (1 + max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle)
+ ( (((t.wordsintext() - min.wordsintext() ) << 8) / (1 + max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext) + ( (((t.wordsintext() - min.wordsintext() ) << 8) / (1 + max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext)
+ ( (((t.phrasesintext()- min.phrasesintext()) << 8) / (1 + max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext) + ( (((t.phrasesintext()- min.phrasesintext()) << 8) / (1 + max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext)
+ ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal) + ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal)
+ ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother) + ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother)
+ ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) + ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ ((255 - (((t.urllength() - min.urllength() ) << 8) / (1 + max.urllength() - min.urllength()) )) << ranking.coeff_urllength)
+ ((255 - (((t.urlcomps() - min.urlcomps() ) << 8) / (1 + max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)
+ ((255 - (((t.posintext() - min.posintext() ) << 8) / (1 + max.posintext() - min.posintext()) )) << ranking.coeff_posintext)
+ ((255 - (((t.posofphrase() - min.posofphrase() ) << 8) / (1 + max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase)
+ ((255 - (((t.posinphrase() - min.posinphrase() ) << 8) / (1 + max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase)
+ ((255 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance)
+ (((flags.get(indexRWIEntry.flag_app_url)) ? 255 << ranking.coeff_appurl : 0)) + (((flags.get(indexRWIEntry.flag_app_url)) ? 255 << ranking.coeff_appurl : 0))
+ (((flags.get(indexRWIEntry.flag_app_descr)) ? 255 << ranking.coeff_appdescr : 0)) + (((flags.get(indexRWIEntry.flag_app_descr)) ? 255 << ranking.coeff_appdescr : 0))
+ (((flags.get(indexRWIEntry.flag_app_author)) ? 255 << ranking.coeff_appauthor : 0)) + (((flags.get(indexRWIEntry.flag_app_author)) ? 255 << ranking.coeff_appauthor : 0))

@ -117,6 +117,7 @@ public class indexURLEntry {
private kelondroRow.Entry entry; private kelondroRow.Entry entry;
private String snippet; private String snippet;
private indexRWIEntry word; // this is only used if the url is transported via remote search requests private indexRWIEntry word; // this is only used if the url is transported via remote search requests
private long ranking; // during generation of a search result this value is set
public indexURLEntry( public indexURLEntry(
yacyURL url, yacyURL url,
@ -163,6 +164,7 @@ public class indexURLEntry {
//System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString()); //System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString());
this.snippet = null; this.snippet = null;
this.word = null; this.word = null;
this.ranking = 0;
} }
private void encodeDate(int col, Date d) { private void encodeDate(int col, Date d) {
@ -184,10 +186,11 @@ public class indexURLEntry {
return s.toString().getBytes(); return s.toString().getBytes();
} }
public indexURLEntry(kelondroRow.Entry entry, indexRWIEntry searchedWord) { public indexURLEntry(kelondroRow.Entry entry, indexRWIEntry searchedWord, long ranking) {
this.entry = entry; this.entry = entry;
this.snippet = null; this.snippet = null;
this.word = searchedWord; this.word = searchedWord;
this.ranking = ranking;
} }
public indexURLEntry(Properties prop){ public indexURLEntry(Properties prop){
@ -243,6 +246,7 @@ public class indexURLEntry {
if (prop.containsKey("wi")) { if (prop.containsKey("wi")) {
this.word = new indexRWIRowEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))); this.word = new indexRWIRowEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("wi", "")));
} }
this.ranking = 0;
} }
private StringBuffer corePropList() { private StringBuffer corePropList() {
@ -301,6 +305,10 @@ public class indexURLEntry {
return this.entry.getColString(col_hash, null); return this.entry.getColString(col_hash, null);
} }
/**
 * Returns the ranking value held by this entry.
 * The value is the one handed to the constructor that takes a ranking argument;
 * the other constructors shown in this change initialise it to 0.
 *
 * @return the stored ranking value
 */
public long ranking() {
return this.ranking;
}
public indexURLEntry.Components comp() { public indexURLEntry.Components comp() {
ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8"); ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8");
return new indexURLEntry.Components( return new indexURLEntry.Components(

@ -413,6 +413,7 @@ public class kelondroRowCollection {
if (this.chunkcount < isortlimit) { if (this.chunkcount < isortlimit) {
isort(0, this.chunkcount, new byte[this.rowdef.objectsize]); isort(0, this.chunkcount, new byte[this.rowdef.objectsize]);
this.sortBound = this.chunkcount; this.sortBound = this.chunkcount;
assert this.isSorted();
return; return;
} }
byte[] swapspace = new byte[this.rowdef.objectsize]; byte[] swapspace = new byte[this.rowdef.objectsize];
@ -555,6 +556,7 @@ public class kelondroRowCollection {
public synchronized boolean isSorted() { public synchronized boolean isSorted() {
assert (this.rowdef.objectOrder != null); assert (this.rowdef.objectOrder != null);
if (chunkcount <= 1) return true; if (chunkcount <= 1) return true;
if (chunkcount != this.sortBound) return false;
for (int i = 0; i < chunkcount - 1; i++) { for (int i = 0; i < chunkcount - 1; i++) {
//System.out.println("*" + new String(get(i).getColBytes(0))); //System.out.println("*" + new String(get(i).getColBytes(0)));
if (compare(i, i + 1) > 0) { if (compare(i, i + 1) > 0) {

@ -193,7 +193,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// we need to import the url // we need to import the url
// getting the url entry // getting the url entry
indexURLEntry urlEntry = this.importWordIndex.loadedURL.load(urlHash, null); indexURLEntry urlEntry = this.importWordIndex.loadedURL.load(urlHash, null, 0);
if (urlEntry != null) { if (urlEntry != null) {
/* write it into the home url db */ /* write it into the home url db */

@ -153,7 +153,7 @@ public final class plasmaCrawlLURL {
return 0; return 0;
} }
public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord) { public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord, long ranking) {
// generates an plasmaLURLEntry using the url hash // generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered // to speed up the access, the url-hashes are buffered
// in the hash cache. // in the hash cache.
@ -165,7 +165,7 @@ public final class plasmaCrawlLURL {
try { try {
kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes()); kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null; if (entry == null) return null;
return new indexURLEntry(entry, searchedWord); return new indexURLEntry(entry, searchedWord, ranking);
} catch (IOException e) { } catch (IOException e) {
return null; return null;
} }
@ -176,7 +176,7 @@ public final class plasmaCrawlLURL {
indexURLEntry oldEntry; indexURLEntry oldEntry;
try { try {
if (exists(entry.hash())) { if (exists(entry.hash())) {
oldEntry = load(entry.hash(), null); oldEntry = load(entry.hash(), null, 0);
} else { } else {
oldEntry = null; oldEntry = null;
} }
@ -342,7 +342,7 @@ public final class plasmaCrawlLURL {
if (this.iter == null) { return null; } if (this.iter == null) { return null; }
if (this.iter.hasNext()) { e = (kelondroRow.Entry) this.iter.next(); } if (this.iter.hasNext()) { e = (kelondroRow.Entry) this.iter.next(); }
if (e == null) { return null; } if (e == null) { return null; }
return new indexURLEntry(e, null); return new indexURLEntry(e, null, 0);
} }
public final void remove() { public final void remove() {

@ -455,7 +455,7 @@ public final class plasmaCrawlStacker extends Thread {
// check if the url is double registered // check if the url is double registered
String dbocc = sb.crawlQueues.urlExists(entry.url().hash()); String dbocc = sb.crawlQueues.urlExists(entry.url().hash());
indexURLEntry oldEntry = this.sb.wordIndex.loadedURL.load(entry.url().hash(), null); indexURLEntry oldEntry = this.sb.wordIndex.loadedURL.load(entry.url().hash(), null, 0);
boolean recrawl = (oldEntry != null) && ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) > profile.recrawlIfOlder()); boolean recrawl = (oldEntry != null) && ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) > profile.recrawlIfOlder());
// apply recrawl rule // apply recrawl rule
if ((dbocc != null) && (!(recrawl))) { if ((dbocc != null) && (!(recrawl))) {

@ -8,7 +8,7 @@
// //
// $LastChangedDate$ // $LastChangedDate$
// $LastChangedRevision$ // $LastChangedRevision$
// $LastChangedBy: $ // $LastChangedBy$
// //
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by
@ -247,7 +247,7 @@ public class plasmaDHTChunk {
urlIter.remove(); urlIter.remove();
continue; continue;
} }
lurl = wordIndex.loadedURL.load(iEntry.urlHash(), iEntry); lurl = wordIndex.loadedURL.load(iEntry.urlHash(), iEntry, 0);
if ((lurl == null) || (lurl.comp() == null) || (lurl.comp().url() == null)) { if ((lurl == null) || (lurl.comp() == null) || (lurl.comp().url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash()); //yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
notBoundCounter++; notBoundCounter++;

@ -205,22 +205,15 @@ public final class plasmaSearchEvent {
} else { } else {
// prepare result vector directly without worker threads // prepare result vector directly without worker threads
process.startTimer(); process.startTimer();
indexRWIEntry entry; indexURLEntry uentry;
indexURLEntry page;
ResultEntry resultEntry; ResultEntry resultEntry;
synchronized (rankedCache) { synchronized (rankedCache) {
Iterator indexRWIEntryIterator = rankedCache.entries(); Iterator urlIterator = rankedCache.entries(wordIndex, true);
while ((indexRWIEntryIterator.hasNext()) && (resultList.size() < (query.neededResults()))) { while ((urlIterator.hasNext()) && (resultList.size() < (query.neededResults()))) {
// fetch next entry // fetch next entry
entry = (indexRWIEntry) indexRWIEntryIterator.next(); uentry = (indexURLEntry) urlIterator.next();
page = wordIndex.loadedURL.load(entry.urlHash(), entry);
if (page == null) {
registerFailure(entry.urlHash(), "url does not exist in lurl-db");
continue;
}
resultEntry = obtainResultEntry(page, (snippetComputationAllTime < 300) ? 1 : 0); resultEntry = obtainResultEntry(uentry, (snippetComputationAllTime < 300) ? 1 : 0);
if (resultEntry == null) continue; // the entry had some problems, cannot be used if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime; urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime; snippetComputationAllTime += resultEntry.snippetComputationTime;
@ -581,7 +574,7 @@ public final class plasmaSearchEvent {
} }
} }
indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry); indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry, 0);
if (page == null) { if (page == null) {
registerFailure(entry.urlHash(), "url does not exist in lurl-db"); registerFailure(entry.urlHash(), "url does not exist in lurl-db");
continue; continue;
@ -609,7 +602,7 @@ public final class plasmaSearchEvent {
private indexRWIEntry nextOrder() { private indexRWIEntry nextOrder() {
synchronized (rankedCache) { synchronized (rankedCache) {
Iterator i = rankedCache.entries(); Iterator i = rankedCache.entries(null, false);
indexRWIEntry entry; indexRWIEntry entry;
String urlhash; String urlhash;
while (i.hasNext()) { while (i.hasNext()) {

@ -85,13 +85,41 @@ public final class plasmaSearchQuery {
public int domMaxTargets; public int domMaxTargets;
public int maxDistance; public int maxDistance;
public kelondroBitfield constraint; public kelondroBitfield constraint;
public boolean allofconstraint;
public boolean onlineSnippetFetch; public boolean onlineSnippetFetch;
public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet excludeHashes, int maxDistance, String prefer, int contentdom, public plasmaSearchQuery(String queryString, int lines, kelondroBitfield constraint) {
if ((queryString.length() == 12) && (kelondroBase64Order.enhancedCoder.wellformed(queryString.getBytes()))) {
this.queryString = null;
this.queryHashes = new TreeSet();
this.excludeHashes = new TreeSet();
this.queryHashes.add(queryString);
} else {
this.queryString = queryString;
TreeSet[] cq = cleanQuery(queryString);
this.queryHashes = plasmaCondenser.words2hashes(cq[0]);
this.excludeHashes = plasmaCondenser.words2hashes(cq[1]);
}
this.maxDistance = Integer.MAX_VALUE;
this.prefer = "";
this.contentdom = CONTENTDOM_ALL;
this.linesPerPage = lines;
this.offset = 0;
this.maximumTime = 10000;
this.urlMask = ".*";
this.domType = SEARCHDOM_LOCAL;
this.domGroupName = "";
this.domMaxTargets = 0;
this.constraint = constraint;
this.allofconstraint = false;
this.onlineSnippetFetch = false;
}
public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet excludeHashes, int maxDistance, String prefer, int contentdom,
boolean onlineSnippetFetch, boolean onlineSnippetFetch,
int lines, int offset, long maximumTime, String urlMask, int lines, int offset, long maximumTime, String urlMask,
int domType, String domGroupName, int domMaxTargets, int domType, String domGroupName, int domMaxTargets,
kelondroBitfield constraint) { kelondroBitfield constraint, boolean allofconstraint) {
this.queryString = queryString; this.queryString = queryString;
this.queryHashes = queryHashes; this.queryHashes = queryHashes;
this.excludeHashes = excludeHashes; this.excludeHashes = excludeHashes;
@ -106,6 +134,7 @@ public final class plasmaSearchQuery {
this.domGroupName = domGroupName; this.domGroupName = domGroupName;
this.domMaxTargets = domMaxTargets; this.domMaxTargets = domMaxTargets;
this.constraint = constraint; this.constraint = constraint;
this.allofconstraint = allofconstraint;
this.onlineSnippetFetch = onlineSnippetFetch; this.onlineSnippetFetch = onlineSnippetFetch;
} }

@ -61,6 +61,7 @@ public final class plasmaSearchRankingProcess {
private int globalcount; private int globalcount;
private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private int[] c; // flag counter
public plasmaSearchRankingProcess(plasmaSearchQuery query, plasmaSearchProcessing process, plasmaSearchRankingProfile ranking, int maxentries) { public plasmaSearchRankingProcess(plasmaSearchQuery query, plasmaSearchProcessing process, plasmaSearchRankingProfile ranking, int maxentries) {
// we collect the urlhashes and construct a list with urlEntry objects // we collect the urlhashes and construct a list with urlEntry objects
@ -74,6 +75,8 @@ public final class plasmaSearchRankingProcess {
this.globalcount = 0; this.globalcount = 0;
this.urlhashes = new HashMap(); this.urlhashes = new HashMap();
this.ref = new kelondroMScoreCluster(); this.ref = new kelondroMScoreCluster();
c = new int[32];
for (int i = 0; i < 32; i++) {c[i] = 0;}
} }
public void insert(indexContainer container, boolean local) { public void insert(indexContainer container, boolean local) {
@ -83,12 +86,12 @@ public final class plasmaSearchRankingProcess {
assert (container != null); assert (container != null);
if (container.size() == 0) return; if (container.size() == 0) return;
process.startTimer(); if (process != null) process.startTimer();
if (this.order == null) { if (this.order == null) {
this.order = new indexRWIEntryOrder(ranking); this.order = new indexRWIEntryOrder(ranking);
} }
this.order.extend(container); this.order.extend(container);
process.yield("normalizing", container.size()); if (process != null) process.yield("normalizing", container.size());
/* /*
container.setOrdering(o, 0); container.setOrdering(o, 0);
@ -96,7 +99,7 @@ public final class plasmaSearchRankingProcess {
*/ */
// normalize entries and get ranking // normalize entries and get ranking
process.startTimer(); if (process != null) process.startTimer();
Iterator i = container.entries(); Iterator i = container.entries();
this.pageAcc = new TreeMap(); this.pageAcc = new TreeMap();
indexRWIEntry iEntry, l; indexRWIEntry iEntry, l;
@ -107,8 +110,14 @@ public final class plasmaSearchRankingProcess {
iEntry = (indexRWIEntry) i.next(); iEntry = (indexRWIEntry) i.next();
if (iEntry.urlHash().length() != container.row().primaryKeyLength) continue; if (iEntry.urlHash().length() != container.row().primaryKeyLength) continue;
// increase flag counts
for (int j = 0; j < 32; j++) {
if (iEntry.flags().get(j)) {c[j]++;}
}
// kick out entries that are too bad according to current findings
r = new Long(order.cardinal(iEntry)); r = new Long(order.cardinal(iEntry));
if ((pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue; if ((maxentries >= 0) && (pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue;
// check constraints // check constraints
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint
@ -118,7 +127,7 @@ public final class plasmaSearchRankingProcess {
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue; if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue; if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue;
} }
if (pageAcc.size() < maxentries) { if ((maxentries < 0) || (pageAcc.size() < maxentries)) {
if (urlhashes.containsKey(iEntry.urlHash())) continue; if (urlhashes.containsKey(iEntry.urlHash())) continue;
while (pageAcc.containsKey(r)) r = new Long(r.longValue() + 1); while (pageAcc.containsKey(r)) r = new Long(r.longValue() + 1);
pageAcc.put(r, iEntry); pageAcc.put(r, iEntry);
@ -145,7 +154,38 @@ public final class plasmaSearchRankingProcess {
if (container.size() > query.neededResults()) remove(true, true); if (container.size() > query.neededResults()) remove(true, true);
process.yield(plasmaSearchProcessing.PRESORT, container.size()); if (process != null) process.yield(plasmaSearchProcessing.PRESORT, container.size());
}
/**
 * Iterator over the entries of pageAcc, following the iteration order of pageAcc.entrySet().
 * Depending on the fetchURLs constructor argument it yields either the stored
 * indexRWIEntry objects or indexURLEntry objects loaded through wi.loadedURL.
 * Removal is not supported.
 */
public class rIterator implements Iterator {
// true: next() returns indexURLEntry objects; false: next() returns indexRWIEntry objects
boolean urls;
// iterator over the Map.Entry elements of pageAcc, obtained once at construction time
Iterator r;
// word index used to load URL entries; only dereferenced when urls == true
plasmaWordIndex wi;
/**
 * @param wi        word index whose loadedURL is queried in next(); not used when fetchURLs is false
 * @param fetchURLs selects the element type produced by next()
 */
public rIterator(plasmaWordIndex wi, boolean fetchURLs) {
// if fetchURLs == true, this iterates indexURLEntry objects, otherwise it iterates indexRWIEntry objects
this.urls = fetchURLs;
this.r = pageAcc.entrySet().iterator();
this.wi = wi;
}
/** Returns true while the underlying pageAcc entry iterator has more elements. */
public boolean hasNext() {
return r.hasNext();
}
/**
 * Advances the underlying iterator by one map entry.
 * If urls is false the entry's value (an indexRWIEntry) is returned directly.
 * If urls is true the URL entry is loaded by the value's url hash; the value itself
 * and the map key (cast to Long) are passed along as searched word and ranking.
 * NOTE(review): the result of load(...) is returned unchecked. loadedURL.load
 * (presumably plasmaCrawlLURL.load) returns null when no row exists for the hash
 * or on IOException, so callers presumably have to handle a null element - confirm.
 */
public Object next() {
Map.Entry entry = (Map.Entry) r.next();
indexRWIEntry ientry = (indexRWIEntry) entry.getValue();
if (urls) {
return wi.loadedURL.load(ientry.urlHash(), ientry, ((Long) entry.getKey()).longValue());
} else {
return ientry;
}
}
/** Always throws UnsupportedOperationException; elements cannot be removed through this iterator. */
public void remove() {
throw new UnsupportedOperationException();
}
} }
public int size() { public int size() {
@ -153,6 +193,10 @@ public final class plasmaSearchRankingProcess {
return pageAcc.size(); return pageAcc.size();
} }
/**
 * Returns the flag counter array c.
 * The array has 32 slots, all zeroed in the constructor; insert() increments slot j
 * for each container entry that passes the key-length check and has flag bit j set,
 * before the ranking cut-off and constraint filters are applied.
 * The internal array itself is returned, not a copy.
 *
 * @return the 32-element flag counter array
 */
public int[] flagCount() {
return c;
}
public int filteredCount() { public int filteredCount() {
return this.filteredCount; return this.filteredCount;
} }
@ -170,9 +214,9 @@ public final class plasmaSearchRankingProcess {
return iEntry; return iEntry;
} }
public Iterator entries() { public Iterator entries(plasmaWordIndex wi, boolean fetchURLs) {
// returns an iterator of indexRWIEntry objects in the ranked order, best entry first // if fetchURLs == true, this iterates indexURLEntry objects, otherwise it iterates indexRWIEntry objects
return this.pageAcc.values().iterator(); return new rIterator(wi, fetchURLs);
} }
public Set getReferences(int count) { public Set getReferences(int count) {

@ -64,6 +64,7 @@ public class plasmaSearchRankingProfile {
public static final String HITCOUNT = "hitcount"; public static final String HITCOUNT = "hitcount";
public static final String POSINTEXT = "posintext"; public static final String POSINTEXT = "posintext";
public static final String POSOFPHRASE = "posofphrase"; public static final String POSOFPHRASE = "posofphrase";
public static final String POSINPHRASE = "posinphrase";
public static final String WORDDISTANCE = "worddistance"; public static final String WORDDISTANCE = "worddistance";
public static final String APPURL = "appurl"; public static final String APPURL = "appurl";
public static final String APPDESCR = "appdescr"; public static final String APPDESCR = "appdescr";
@ -154,6 +155,7 @@ public class plasmaSearchRankingProfile {
coeff_hitcount = parseMap(coeff, HITCOUNT, coeff_hitcount); coeff_hitcount = parseMap(coeff, HITCOUNT, coeff_hitcount);
coeff_posintext = parseMap(coeff, POSINTEXT, coeff_posintext); coeff_posintext = parseMap(coeff, POSINTEXT, coeff_posintext);
coeff_posofphrase = parseMap(coeff, POSOFPHRASE, coeff_posofphrase); coeff_posofphrase = parseMap(coeff, POSOFPHRASE, coeff_posofphrase);
coeff_posinphrase = parseMap(coeff, POSINPHRASE, coeff_posinphrase);
coeff_worddistance = parseMap(coeff, WORDDISTANCE, coeff_worddistance); coeff_worddistance = parseMap(coeff, WORDDISTANCE, coeff_worddistance);
coeff_appurl = parseMap(coeff, APPURL, coeff_appurl); coeff_appurl = parseMap(coeff, APPURL, coeff_appurl);
coeff_appdescr = parseMap(coeff, APPDESCR, coeff_appdescr); coeff_appdescr = parseMap(coeff, APPDESCR, coeff_appdescr);
@ -207,6 +209,7 @@ public class plasmaSearchRankingProfile {
ext.put(prefix + HITCOUNT, Integer.toString(coeff_hitcount)); ext.put(prefix + HITCOUNT, Integer.toString(coeff_hitcount));
ext.put(prefix + POSINTEXT, Integer.toString(coeff_posintext)); ext.put(prefix + POSINTEXT, Integer.toString(coeff_posintext));
ext.put(prefix + POSOFPHRASE, Integer.toString(coeff_posofphrase)); ext.put(prefix + POSOFPHRASE, Integer.toString(coeff_posofphrase));
ext.put(prefix + POSINPHRASE, Integer.toString(coeff_posinphrase));
ext.put(prefix + WORDDISTANCE, Integer.toString(coeff_worddistance)); ext.put(prefix + WORDDISTANCE, Integer.toString(coeff_worddistance));
ext.put(prefix + APPURL, Integer.toString(coeff_appurl)); ext.put(prefix + APPURL, Integer.toString(coeff_appurl));
ext.put(prefix + APPDESCR, Integer.toString(coeff_appdescr)); ext.put(prefix + APPDESCR, Integer.toString(coeff_appdescr));

@ -1499,7 +1499,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (urlhash.equals(yacyURL.dummyHash)) return null; if (urlhash.equals(yacyURL.dummyHash)) return null;
yacyURL ne = crawlQueues.getURL(urlhash); yacyURL ne = crawlQueues.getURL(urlhash);
if (ne != null) return ne; if (ne != null) return ne;
indexURLEntry le = wordIndex.loadedURL.load(urlhash, null); indexURLEntry le = wordIndex.loadedURL.load(urlhash, null, 0);
if (le != null) return le.comp().url(); if (le != null) return le.comp().url();
return null; return null;
} }
@ -2541,7 +2541,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry // finally, delete the url entry
// determine the url string // determine the url string
indexURLEntry entry = wordIndex.loadedURL.load(urlhash, null); indexURLEntry entry = wordIndex.loadedURL.load(urlhash, null, 0);
if (entry == null) return 0; if (entry == null) return 0;
indexURLEntry.Components comp = entry.comp(); indexURLEntry.Components comp = entry.comp();
if (comp.url() == null) return 0; if (comp.url() == null) return 0;

@ -328,7 +328,7 @@ public class plasmaSwitchboardQueue {
public yacyURL referrerURL() { public yacyURL referrerURL() {
if (referrerURL == null) { if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(yacyURL.dummyHash))) return null; if ((referrerHash == null) || (referrerHash.equals(yacyURL.dummyHash))) return null;
indexURLEntry entry = lurls.load(referrerHash, null); indexURLEntry entry = lurls.load(referrerHash, null, 0);
if (entry == null) referrerURL = null; else referrerURL = entry.comp().url(); if (entry == null) referrerURL = null; else referrerURL = entry.comp().url();
} }
return referrerURL; return referrerURL;

@ -389,22 +389,34 @@ public final class plasmaWordIndex implements indexRI {
return containers; return containers;
} }
public Finding retrieveURLs(String keyhash, kelondroBitfield filter, boolean all, int maxcount, boolean loadurl, int sortorder) { public Finding retrieveURLs(plasmaSearchQuery query, boolean loadurl, int sortorder, plasmaSearchRankingProfile ranking) {
// search for a word hash and generate a list of url links // search for a word hash and generate a list of url links
// sortorder: 0 = hash, 1 = url, 2 = ranking // sortorder: 0 = hash, 1 = url, 2 = ranking
indexContainer index = getContainer(keyhash, null); assert query.queryHashes.size() == 1;
final TreeMap tm = new TreeMap();
final TreeSet mi = new TreeSet(); final TreeSet mi = new TreeSet();
final ArrayList indexes = new ArrayList(); String keyhash = (String) query.queryHashes.first();
kelondroBitfield filter = query.constraint;
indexContainer index = getContainer(keyhash, null);
indexRWIEntry ientry;
indexURLEntry uentry;
final int[] c = new int[32]; final int[] c = new int[32];
for (int i = 0; i < 32; i++) {c[i] = 0;} for (int i = 0; i < 32; i++) {c[i] = 0;}
if ((index != null) && (index.size() != 0)) { if ((index == null) || (index.size() == 0)) {
return new Finding(mi.iterator(), mi.iterator(), mi, 0, c);
}
if (sortorder == 2) {
plasmaSearchRankingProcess process = new plasmaSearchRankingProcess(query, null, ranking, query.neededResults());
process.insert(index, true);
return new Finding(process.entries(this, true), null, mi, process.filteredCount(), process.flagCount());
} else {
final TreeMap tm = new TreeMap();
final ArrayList indexes = new ArrayList();
final Iterator en = index.entries(); final Iterator en = index.entries();
// generate a new map where the urls are sorted (not by hash but by the url text) // generate a new map where the urls are sorted (not by hash but by the url text)
indexRWIEntry ientry;
indexURLEntry uentry;
loop: while (en.hasNext()) { loop: while (en.hasNext()) {
ientry = (indexRWIEntry) en.next(); ientry = (indexRWIEntry) en.next();
@ -412,7 +424,7 @@ public final class plasmaWordIndex implements indexRI {
if (filter != null) { if (filter != null) {
// if all = true: let only entries pass that has all matching bits // if all = true: let only entries pass that has all matching bits
// if all = false: let all entries pass that has at least one matching bit // if all = false: let all entries pass that has at least one matching bit
if (all) { if (query.allofconstraint) {
for (int i = 0; i < 32; i++) { for (int i = 0; i < 32; i++) {
if ((filter.get(i)) && (!ientry.flags().get(i))) continue loop; if ((filter.get(i)) && (!ientry.flags().get(i))) continue loop;
} }
@ -432,46 +444,51 @@ public final class plasmaWordIndex implements indexRI {
// load url // load url
if (loadurl) { if (loadurl) {
uentry = loadedURL.load(ientry.urlHash(), null); uentry = loadedURL.load(ientry.urlHash(), ientry, 0);
if (uentry == null) { if (uentry == null) {
mi.add(ientry.urlHash()); mi.add(ientry.urlHash());
} else { } else {
if (sortorder == 0) { if (sortorder == 0) {
tm.put(uentry.comp().url().toNormalform(false, true), new Item(ientry, uentry)); tm.put(uentry.comp().url().toNormalform(false, true), uentry);
} }
if (sortorder == 1) { if (sortorder == 1) {
tm.put(ientry.urlHash(), new Item(ientry, uentry)); tm.put(ientry.urlHash(), uentry);
} }
} }
} else { } else {
indexes.add(new Item(ientry, null)); indexes.add(ientry);
}
if ((maxcount > 0) && (mi.size() + tm.size() > maxcount)) break loop;
}
} }
if ((query.neededResults() > 0) && (mi.size() + tm.size() > query.neededResults())) break loop;
} // end loop
if (loadurl) { if (loadurl) {
return new Finding(tm.values().iterator(), mi, tm.size(), c); return new Finding(tm.values().iterator(), null, mi, tm.size(), c);
} else { } else {
return new Finding(indexes.iterator(), mi, indexes.size(), c); return new Finding(null, indexes.iterator(), mi, indexes.size(), c);
}
} }
} }
public class Finding { public static class Finding {
private Iterator items; // an iterator if Items objects private Iterator urls; // an iterator if indexURLEntry objects
private Iterator rwientries; // an iterator of indexRWIEntry objects
private Set misses; // a set of hashes where we did not found items private Set misses; // a set of hashes where we did not found items
private int findcount; private int findcount;
private int[] flagcount; private int[] flagcount;
public Finding(Iterator items, Set misses, int findcount, int[] flagcount) { public Finding(Iterator urls, Iterator rwientries, Set misses, int findcount, int[] flagcount) {
this.findcount = findcount; this.findcount = findcount;
this.items = items; this.urls = urls;
this.rwientries = rwientries;
this.misses = misses; this.misses = misses;
this.flagcount = flagcount; this.flagcount = flagcount;
} }
public int size() { public int size() {
return this.findcount; return this.findcount;
} }
public Iterator hit() { public Iterator urls() {
return this.items; return this.urls;
}
public Iterator rwientries() {
return this.rwientries;
} }
public Set miss() { public Set miss() {
return this.misses; return this.misses;
@ -481,28 +498,6 @@ public final class plasmaWordIndex implements indexRI {
} }
} }
public class Item {
private indexRWIEntry ientry;
private indexURLEntry uentry;
public Item() {
ientry = null;
uentry = null;
}
public Item(indexRWIEntry ientry, indexURLEntry uentry) {
this.ientry = ientry;
this.uentry = uentry;
}
public boolean found() {
return (ientry != null) && (uentry != null);
}
public indexRWIEntry index() {
return this.ientry;
}
public indexURLEntry url() {
return this.uentry;
}
}
public int size() { public int size() {
return java.lang.Math.max(collections.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size())); return java.lang.Math.max(collections.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size()));
} }
@ -712,7 +707,7 @@ public final class plasmaWordIndex implements indexRI {
entry = (indexRWIEntry) containerIterator.next(); entry = (indexRWIEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash: // System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash()); // "+entry.getUrlHash());
indexURLEntry ue = lurl.load(entry.urlHash(), null); indexURLEntry ue = lurl.load(entry.urlHash(), entry, 0);
if (ue == null) { if (ue == null) {
urlHashs.add(entry.urlHash()); urlHashs.add(entry.urlHash());
} else { } else {

@ -991,10 +991,11 @@ public class yacyURL {
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0); return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0);
} }
private static final char rootURLFlag = subdomPortPath("www", 80, ""); private static final char rootURLFlag0 = subdomPortPath("", 80, "");
private static final char rootURLFlag1 = subdomPortPath("www", 80, "");
public static final boolean probablyRootURL(String urlHash) { public static final boolean probablyRootURL(String urlHash) {
return (urlHash.charAt(5) == rootURLFlag); return (urlHash.charAt(5) == rootURLFlag0) || (urlHash.charAt(5) == rootURLFlag1);
} }
private static String protocolHostPort(String protocol, String host, int port) { private static String protocolHostPort(String protocol, String host, int port) {

@ -630,7 +630,7 @@ public final class yacy {
iEntry = (indexRWIEntry) wordIdxEntries.next(); iEntry = (indexRWIEntry) wordIdxEntries.next();
String urlHash = iEntry.urlHash(); String urlHash = iEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
indexURLEntry urlEntry = currentUrlDB.load(urlHash, null); indexURLEntry urlEntry = currentUrlDB.load(urlHash, null, 0);
urlCounter++; urlCounter++;
minimizedUrlDB.store(urlEntry); minimizedUrlDB.store(urlEntry);
if (urlCounter % 500 == 0) { if (urlCounter % 500 == 0) {

Loading…
Cancel
Save