enhanced ranking

- redesign of data storage in plasmaSearchRankingProfile
- profiles are extended by new ranking parameters
- new RWI ranking parameters are considered during ranking
- appearance attributes (e.g. emphasised text) are now considered
- faster ranking
- some attributes that had been checked during post-ranking can now be
  checked during pre-ranking phase
- removed old ranking parameter on index.html page (will be replaced by profiles in the future)
- ranking can now consider appearances of media content
- snippet-loading for media types now works correctly (fetches only from the wanted media)
- ranking-profiles can be handed over to remote peers and are applied there as well
- repeating a search for the same query with a different content domain now also re-triggers the remote search

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3105 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent d0c32c6aeb
commit 0a050bc043

@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.494
releaseVersion=0.495
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@ -47,10 +47,6 @@
<fieldset>
<legend>Local Pre-Ranking</legend>
<dl>
<dt>Entropy</dt>
<dd>
<script type="text/javascript">checkers("localentropy", #[localentropy]#)</script>
</dd>
<dt>Date</dt>
<dd>
<script type="text/javascript">checkers("localdate", #[localdate]#)</script>

@ -84,7 +84,7 @@ public class DetailedSearch {
prop.put("results", "");
prop.put("urlmaskoptions", 0);
prop.put("urlmaskoptions_urlmaskfilter", ".*");
String defaultRankingProfile = new plasmaSearchRankingProfile().toExternalString();
String defaultRankingProfile = new plasmaSearchRankingProfile("text").toExternalString();
prop.putAll(new plasmaSearchRankingProfile("", defaultRankingProfile).toExternalMap("local"));
return prop;
}

@ -29,7 +29,6 @@
<input type="radio" name="contentdom" value="app" #(contentdomCheckApp)#::checked="checked"#(/contentdomCheckApp)# />Applications&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
#(searchoptions)#
<input type="hidden" name="count" value="10" />
<input type="hidden" name="order" value="Date-YBR-Quality" />
<input type="hidden" name="resource" value="global" />
<input type="hidden" name="time" value="6" />
<input type="hidden" name="urlmaskfilter" value=".*" />
@ -52,19 +51,6 @@
</select>
</td>
</tr>
<tr>
<td>order by:</td>
<td>
<select name="order">
<option value="YBR-Date-Quality" #(order-ybr-date-quality)#::selected="selected"#(/order-ybr-date-quality)#>YBR-Date-Quality</option>
<option value="YBR-Quality-Date" #(order-ybr-quality-date)#::selected="selected"#(/order-ybr-quality-date)#>YBR-Quality-Date</option>
<option value="Date-YBR-Quality" #(order-date-ybr-quality)#::selected="selected"#(/order-date-ybr-quality)#>Date-YBR-Quality</option>
<option value="Quality-YBR-Date" #(order-quality-ybr-date)#::selected="selected"#(/order-quality-ybr-date)#>Quality-YBR-Date</option>
<option value="Date-Quality-YBR" #(order-date-quality-ybr)#::selected="selected"#(/order-date-quality-ybr)#>Date-Quality-YBR</option>
<option value="Quality-Date-YBR" #(order-quality-date-ybr)#::selected="selected"#(/order-quality-date-ybr)#>Quality-Date-YBR</option>
</select>
</td>
</tr>
<tr>
<td>Resource:</td>
<td>

@ -34,7 +34,6 @@ import java.util.HashMap;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSearchPreOrder;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
@ -106,12 +105,6 @@ public class index {
prop.put("searchoptions_count-50", (count == 50) ? 1 : 0);
prop.put("searchoptions_count-100", (count == 100) ? 1 : 0);
prop.put("searchoptions_count-1000", (count == 1000) ? 1 : 0);
prop.put("searchoptions_order-ybr-date-quality", plasmaSearchPreOrder.canUseYBR() ? 1 : 0);
prop.put("searchoptions_order-ybr-quality-date", 0);
prop.put("searchoptions_order-date-ybr-quality", 0);
prop.put("searchoptions_order-quality-ybr-date", 0);
prop.put("searchoptions_order-date-quality-ybr", plasmaSearchPreOrder.canUseYBR() ? 0 : 1);
prop.put("searchoptions_order-quality-date-ybr", 0);
prop.put("searchoptions_resource-global", ((global) ? 1 : 0));
prop.put("searchoptions_resource-local", ((global) ? 0 : 1));
prop.put("searchoptions_time-1", (time == 1) ? 1 : 0);

@ -70,7 +70,7 @@ public class snippet {
prop.put("links", 0);
} else {
// attach media information
ArrayList mediaSnippets = switchboard.snippetCache.retrieveMediaSnippets(url, queryHashes, true, 1000);
ArrayList mediaSnippets = switchboard.snippetCache.retrieveMediaSnippets(url, queryHashes, media, true, 1000);
plasmaSnippetCache.MediaSnippet ms;
for (int i = 0; i < mediaSnippets.size(); i++) {
ms = (plasmaSnippetCache.MediaSnippet) mediaSnippets.get(i);
@ -79,7 +79,7 @@ public class snippet {
prop.put("link_" + i + "_name", ms.name);
prop.put("link_" + i + "_attr", ms.attr);
}
System.out.println("DEBUG: " + mediaSnippets.size() + " ENTRIES IN MEDIA SNIPPET LINKS for url " + urlString);
//System.out.println("DEBUG: " + mediaSnippets.size() + " ENTRIES IN MEDIA SNIPPET LINKS for url " + urlString);
prop.put("text", "");
prop.put("link", mediaSnippets.size());
prop.put("links", mediaSnippets.size());

@ -70,6 +70,7 @@ import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyDHTAction;
import de.anomic.yacy.yacySeed;
import de.anomic.tools.crypt;
public final class search {
@ -95,6 +96,8 @@ public final class search {
final String prefer = post.get("prefer", "");
final String contentdom = post.get("contentdom", "text");
final String filter = post.get("filter", ".*");
String profile = post.get("profile", ""); // remote profile hand-over
if (profile.length() > 0) profile = crypt.simpleDecode(profile, null);
final boolean includesnippet = post.get("includesnippet", "false").equals("true");
final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______"));
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
@ -140,7 +143,7 @@ public final class search {
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + squery.anonymizedQueryHashes() + " - " + squery.wantedResults + " links");
// prepare a search profile
plasmaSearchRankingProfile rankingProfile = new plasmaSearchRankingProfile(new String[]{plasmaSearchRankingProfile.ORDER_YBR, plasmaSearchRankingProfile.ORDER_DATE, plasmaSearchRankingProfile.ORDER_QUALITY});
plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(contentdom) : new plasmaSearchRankingProfile("", profile);
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults);
plasmaSearchTimingProfile remoteTiming = null;
@ -167,7 +170,7 @@ public final class search {
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + squery.anonymizedQueryHashes() + " - " + squery.wantedResults + " links");
// prepare a search profile
plasmaSearchRankingProfile rankingProfile = new plasmaSearchRankingProfile(new String[]{plasmaSearchRankingProfile.ORDER_YBR, plasmaSearchRankingProfile.ORDER_DATE, plasmaSearchRankingProfile.ORDER_QUALITY});
plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(contentdom) : new plasmaSearchRankingProfile("", profile);
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults);
plasmaSearchTimingProfile remoteTiming = null;

@ -170,122 +170,105 @@ public class yacysearch {
if (!indexDistributeGranted || !indexReceiveGranted) { global = false; }
// find search domain
int contentdom = plasmaSearchQuery.CONTENTDOM_TEXT;
String cds = post.get("contentdom", "text");
if (cds.equals("text")) contentdom = plasmaSearchQuery.CONTENTDOM_TEXT;
if (cds.equals("audio")) contentdom = plasmaSearchQuery.CONTENTDOM_AUDIO;
if (cds.equals("video")) contentdom = plasmaSearchQuery.CONTENTDOM_VIDEO;
if (cds.equals("image")) contentdom = plasmaSearchQuery.CONTENTDOM_IMAGE;
if (cds.equals("app")) contentdom = plasmaSearchQuery.CONTENTDOM_APP;
int contentdomCode = plasmaSearchQuery.CONTENTDOM_TEXT;
String contentdomString = post.get("contentdom", "text");
if (contentdomString.equals("text")) contentdomCode = plasmaSearchQuery.CONTENTDOM_TEXT;
if (contentdomString.equals("audio")) contentdomCode = plasmaSearchQuery.CONTENTDOM_AUDIO;
if (contentdomString.equals("video")) contentdomCode = plasmaSearchQuery.CONTENTDOM_VIDEO;
if (contentdomString.equals("image")) contentdomCode = plasmaSearchQuery.CONTENTDOM_IMAGE;
if (contentdomString.equals("app")) contentdomCode = plasmaSearchQuery.CONTENTDOM_APP;
// patch until better search profiles are available
if ((contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) && (count <= 10)) count = 30;
if ((contentdomCode != plasmaSearchQuery.CONTENTDOM_TEXT) && (count <= 10)) count = 30;
serverObjects prop = new serverObjects();
if (post.get("cat", "href").equals("href")) {
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
// filter out stopwords
final TreeSet filtered = kelondroMSetTools.joinConstructive(query,
plasmaSwitchboard.stopwords);
if (filtered.size() > 0) {
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
}
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
// filter out stopwords
final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords);
if (filtered.size() > 0) {
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
}
// if a minus-button was hit, remove a special reference first
if (post.containsKey("deleteref")) {
if (!sb.verifyAuthentication(header, true)) {
prop.put("AUTHENTICATE", "admin log-in"); // force log-in
return prop;
}
// delete the index entry locally
final String delHash = post.get("deleteref", ""); // urlhash
sb.wordIndex.removeReferences(query, delHash);
// make new news message with negative voting
HashMap map = new HashMap();
map.put("urlhash", delHash);
map.put("vote", "negative");
map.put("refid", "");
yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippavt", map));
// if a minus-button was hit, remove a special reference first
if (post.containsKey("deleteref")) {
if (!sb.verifyAuthentication(header, true)) {
prop.put("AUTHENTICATE", "admin log-in"); // force log-in
return prop;
}
// delete the index entry locally
final String delHash = post.get("deleteref", ""); // urlhash
sb.wordIndex.removeReferences(query, delHash);
// make new news message with negative voting
HashMap map = new HashMap();
map.put("urlhash", delHash);
map.put("vote", "negative");
map.put("refid", "");
yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippavt", map));
}
// if a plus-button was hit, create new voting message
if (post.containsKey("recommendref")) {
if (!sb.verifyAuthentication(header, true)) {
prop.put("AUTHENTICATE", "admin log-in"); // force log-in
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
indexURLEntry urlentry = sb.wordIndex.loadedURL.load(recommendHash, null);
if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp();
plasmaParserDocument document;
document = sb.snippetCache.retrieveDocument(comp.url(), true, 5000, true);
if (document != null) {
// create a news message
HashMap map = new HashMap();
map.put("url", comp.url().toNormalform().replace(',', '|'));
map.put("title", comp.descr().replace(',', ' '));
map.put("description", ((document == null) ? comp.descr() : document.getMainLongTitle()).replace(',', ' '));
map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
document.close();
}
// if a plus-button was hit, create new voting message
if (post.containsKey("recommendref")) {
if (!sb.verifyAuthentication(header, true)) {
prop.put("AUTHENTICATE", "admin log-in"); // force log-in
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
indexURLEntry urlentry = sb.wordIndex.loadedURL.load(recommendHash, null);
if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp();
plasmaParserDocument document;
document = sb.snippetCache.retrieveDocument(comp.url(), true, 5000, true);
if (document != null) {
// create a news message
HashMap map = new HashMap();
map.put("url", comp.url().toNormalform().replace(',', '|'));
map.put("title", comp.descr().replace(',', ' '));
map.put("description", ((document == null) ? comp.descr() : document.getMainLongTitle()).replace(',', ' '));
map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
document.close();
}
}
}
// prepare search order
final boolean yacyonline = ((yacyCore.seedDB != null) && (yacyCore.seedDB.mySeed != null) && (yacyCore.seedDB.mySeed.getAddress() != null));
String order1 = plasmaSearchRankingProfile.ORDER_DATE;
String order2 = plasmaSearchRankingProfile.ORDER_YBR;
String order3 = plasmaSearchRankingProfile.ORDER_QUALITY;
if (order.startsWith("YBR")) order1 = plasmaSearchRankingProfile.ORDER_YBR;
if (order.startsWith("Date")) order1 = plasmaSearchRankingProfile.ORDER_DATE;
if (order.startsWith("Quality")) order1 = plasmaSearchRankingProfile.ORDER_QUALITY;
if (order.indexOf("-YBR-") > 0) order2 = plasmaSearchRankingProfile.ORDER_YBR;
if (order.indexOf("-Date-") > 0) order2 = plasmaSearchRankingProfile.ORDER_DATE;
if (order.indexOf("-Quality-") > 0) order2 = plasmaSearchRankingProfile.ORDER_QUALITY;
if (order.endsWith("YBR")) order3 = plasmaSearchRankingProfile.ORDER_YBR;
if (order.endsWith("Date")) order3 = plasmaSearchRankingProfile.ORDER_DATE;
if (order.endsWith("Quality")) order3 = plasmaSearchRankingProfile.ORDER_QUALITY;
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(
// prepare search properties
final boolean yacyonline = ((yacyCore.seedDB != null) && (yacyCore.seedDB.mySeed != null) && (yacyCore.seedDB.mySeed.getAddress() != null));
final boolean samesearch = env.getConfig("last-search", "").equals(querystring + contentdomString);
final boolean globalsearch = (global) && (yacyonline) && (!samesearch);
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(
query,
maxDistance,
prefermask,
contentdom,
contentdomCode,
count,
searchtime,
urlmask,
((global) && (yacyonline) && (!(env.getConfig(
"last-search", "").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT
: plasmaSearchQuery.SEARCHDOM_LOCAL, "", 20, constraint);
plasmaSearchRankingProfile ranking = new plasmaSearchRankingProfile( new String[] { order1, order2, order3 });
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(4 * thisSearch.maximumTime / 10, thisSearch.wantedResults);
plasmaSearchTimingProfile remoteTiming = new plasmaSearchTimingProfile(6 * thisSearch.maximumTime / 10, thisSearch.wantedResults);
prop = sb.searchFromLocal(thisSearch, ranking, localTiming, remoteTiming, true);
/*
* final serverObjects prop = sb.searchFromLocal(query, order1,
* order2, count, ((global) && (yacyonline) &&
* (!(env.getConfig("last-search","").equals(querystring)))),
* searchtime, urlmask);
*/
// remember the last search expression
env.setConfig("last-search", querystring);
// process result of search
prop.put("type_resultbottomline", 0);
if (filtered.size() > 0) {
prop.put("excluded", 1);
prop.put("excluded_stopwords", filtered.toString());
} else {
prop.put("excluded", 0);
}
(globalsearch) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"",
20,
constraint);
plasmaSearchRankingProfile ranking = new plasmaSearchRankingProfile(contentdomString);
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(4 * thisSearch.maximumTime / 10, thisSearch.wantedResults);
plasmaSearchTimingProfile remoteTiming = new plasmaSearchTimingProfile(6 * thisSearch.maximumTime / 10, thisSearch.wantedResults);
prop = sb.searchFromLocal(thisSearch, ranking, localTiming, remoteTiming, true);
// remember the last search expression
env.setConfig("last-search", querystring + contentdomString);
// process result of search
prop.put("type_resultbottomline", 0);
if (filtered.size() > 0) {
prop.put("excluded", 1);
prop.put("excluded_stopwords", filtered.toString());
} else {
prop.put("excluded", 0);
}
if (prop == null || prop.size() == 0) {
if (post.get("search", "").length() < 3) {
@ -364,7 +347,7 @@ public class yacysearch {
}
prop.put("type", (thisSearch.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) ? 0 : ((thisSearch.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ? 2 : 1));
if (prop.getInt("type", 0) == 1) prop.put("type_mediatype", cds);
if (prop.getInt("type", 0) == 1) prop.put("type_mediatype", contentdomString);
prop.put("cat", "href");
prop.put("depth", "0");
@ -418,12 +401,12 @@ public class yacysearch {
prop.put("display", display);
prop.put("indexof", (indexof) ? "on" : "off");
prop.put("constraint", constraint.exportB64());
prop.put("contentdom", cds);
prop.put("contentdomCheckText", (contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) ? 1 : 0);
prop.put("contentdomCheckAudio", (contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) ? 1 : 0);
prop.put("contentdomCheckVideo", (contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) ? 1 : 0);
prop.put("contentdomCheckImage", (contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) ? 1 : 0);
prop.put("contentdomCheckApp", (contentdom == plasmaSearchQuery.CONTENTDOM_APP) ? 1 : 0);
prop.put("contentdom", contentdomString);
prop.put("contentdomCheckText", (contentdomCode == plasmaSearchQuery.CONTENTDOM_TEXT) ? 1 : 0);
prop.put("contentdomCheckAudio", (contentdomCode == plasmaSearchQuery.CONTENTDOM_AUDIO) ? 1 : 0);
prop.put("contentdomCheckVideo", (contentdomCode == plasmaSearchQuery.CONTENTDOM_VIDEO) ? 1 : 0);
prop.put("contentdomCheckImage", (contentdomCode == plasmaSearchQuery.CONTENTDOM_IMAGE) ? 1 : 0);
prop.put("contentdomCheckApp", (contentdomCode == plasmaSearchQuery.CONTENTDOM_APP) ? 1 : 0);
// return rewrite properties
return prop;

@ -552,8 +552,8 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http
String mimeType = mimeTable.getProperty(targetExt, "text/html");
// generate an byte array from the generated image
int width = i.getWidth(null);
int height = i.getHeight(null);
int width = i.getWidth(null); if (width < 0) width = 96; // bad hack
int height = i.getHeight(null); if (height < 0) height = 96; // bad hack
BufferedImage bi = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
bi.createGraphics().drawImage(i, 0, 0, width, height, null);
serverByteBuffer baos = new serverByteBuffer();

@ -43,11 +43,16 @@ public interface indexRWIEntry {
public int posintext();
public int posinphrase();
public int posofphrase();
public int wordcount();
public int phrasecount();
public int wordsintext();
public int phrasesintext();
public String getLanguage();
public char getType();
public kelondroBitfield flags();
public int wordsintitle();
public int llocal();
public int lother();
public int urllength();
public int urlcomps();
public void combineDistance(indexRWIEntry oe);
public int worddistance();

@ -160,8 +160,8 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
this.entry.setCol(col_lastModified, mddlm);
this.entry.setCol(col_freshUntil, 0);
this.entry.setCol(col_wordsInTitle, 20); // guessed
this.entry.setCol(col_wordsInText, oldEntry.wordcount());
this.entry.setCol(col_phrasesInText, oldEntry.phrasecount());
this.entry.setCol(col_wordsInText, oldEntry.wordsintext());
this.entry.setCol(col_phrasesInText, oldEntry.phrasesintext());
this.entry.setCol(col_doctype, new byte[]{(byte) oldEntry.doctype()});
this.entry.setCol(col_language, (oldEntry.getLanguage() == null) ? "en" : oldEntry.getLanguage(), null);
this.entry.setCol(col_llocal, 0);
@ -231,6 +231,10 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
/**
 * Reads the col_lastModified column, narrows it to int and passes it through
 * plasmaWordIndex.reverseMicroDateDays.
 *
 * @return the result of reverseMicroDateDays for the stored value
 *         (presumably a timestamp in milliseconds - TODO confirm against plasmaWordIndex)
 */
public long lastModified() {
return plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
}
/**
 * Reads the col_freshUntil column, narrows it to int and passes it through
 * plasmaWordIndex.reverseMicroDateDays, mirroring lastModified().
 *
 * @return the result of reverseMicroDateDays for the stored value
 *         (presumably a timestamp in milliseconds - TODO confirm against plasmaWordIndex)
 */
public long freshUntil() {
return plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_freshUntil));
}
public int hitcount() {
return (int) this.entry.getColLong(col_hitcount);
@ -248,11 +252,11 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
return (int) this.entry.getColLong(col_posofphrase);
}
public int wordcount() {
public int wordsintext() {
return (int) this.entry.getColLong(col_wordsInText);
}
public int phrasecount() {
public int phrasesintext() {
return (int) this.entry.getColLong(col_phrasesInText);
}
@ -264,6 +268,26 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
return (char) this.entry.getColByte(col_doctype);
}
/**
 * Reads the col_wordsInTitle column of this entry.
 *
 * @return the stored column value, narrowed from long to int
 */
public int wordsintitle() {
return (int) this.entry.getColLong(col_wordsInTitle);
}
/**
 * Reads the col_llocal column of this entry.
 * NOTE(review): presumably the number of links to the same domain - confirm against the column definition.
 *
 * @return the stored column value, narrowed from long to int
 */
public int llocal() {
return (int) this.entry.getColLong(col_llocal);
}
/**
 * Reads the col_lother column of this entry.
 * NOTE(review): presumably the number of links to other domains - confirm against the column definition.
 *
 * @return the stored column value, narrowed from long to int
 */
public int lother() {
return (int) this.entry.getColLong(col_lother);
}
/**
 * Reads the col_urlLength column of this entry.
 *
 * @return the stored column value, narrowed from long to int
 */
public int urllength() {
return (int) this.entry.getColLong(col_urlLength);
}
/**
 * Reads the col_urlComps column of this entry.
 *
 * @return the stored column value, narrowed from long to int
 */
public int urlcomps() {
return (int) this.entry.getColLong(col_urlComps);
}
/**
 * Wraps the bytes of the col_flags column in a newly created kelondroBitfield.
 *
 * @return a new kelondroBitfield built from the stored flag bytes
 */
public kelondroBitfield flags() {
return new kelondroBitfield(this.entry.getColBytes(col_flags));
}
@ -278,7 +302,7 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2.posintext()));
ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? ie1.posofphrase() : 0 /*unknown*/);
ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase()));
ie1.entry.setCol(col_wordsInText, (ie1.wordcount() + ie2.wordcount()) / 2);
ie1.entry.setCol(col_wordsInText, (ie1.wordsintext() + ie2.wordsintext()) / 2);
return ie1;
}
@ -292,24 +316,30 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
public static final void min(indexRWIEntryNew t, indexRWIEntry other) {
if (t.hitcount() > other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
if (t.wordcount() > other.wordcount()) t.entry.setCol(col_wordsInText, other.wordcount());
if (t.phrasecount() > other.phrasecount()) t.entry.setCol(col_phrasesInText, other.phrasecount());
if (t.wordsintext() > other.wordsintext()) t.entry.setCol(col_wordsInText, other.wordsintext());
if (t.phrasesintext() > other.phrasesintext()) t.entry.setCol(col_phrasesInText, other.phrasesintext());
if (t.posintext() > other.posintext()) t.entry.setCol(col_posintext, other.posintext());
if (t.posinphrase() > other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
if (t.posofphrase() > other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
if (t.worddistance() > other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
if (t.lastModified() > other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
if (t.urllength() > other.urllength()) t.entry.setCol(col_urlLength, other.urllength());
if (t.urlcomps() > other.urlcomps()) t.entry.setCol(col_urlComps, other.urlcomps());
if (t.wordsintitle() > other.wordsintitle() ) t.entry.setCol(col_wordsInTitle, other.wordsintitle());
}
public static final void max(indexRWIEntryNew t, indexRWIEntry other) {
if (t.hitcount() < other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
if (t.wordcount() < other.wordcount()) t.entry.setCol(col_wordsInText, other.wordcount());
if (t.phrasecount() < other.phrasecount()) t.entry.setCol(col_phrasesInText, other.phrasecount());
if (t.wordsintext() < other.wordsintext()) t.entry.setCol(col_wordsInText, other.wordsintext());
if (t.phrasesintext() < other.phrasesintext()) t.entry.setCol(col_phrasesInText, other.phrasesintext());
if (t.posintext() < other.posintext()) t.entry.setCol(col_posintext, other.posintext());
if (t.posinphrase() < other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
if (t.posofphrase() < other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
if (t.worddistance() < other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance());
if (t.lastModified() < other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified());
if (t.urllength() < other.urllength()) t.entry.setCol(col_urlLength, other.urllength());
if (t.urlcomps() < other.urlcomps()) t.entry.setCol(col_urlComps, other.urlcomps());
if (t.wordsintitle() < other.wordsintitle() ) t.entry.setCol(col_wordsInTitle, other.wordsintitle());
}
@ -330,13 +360,17 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
//System.out.println("min = " + min.toPropertyForm(true));
//System.out.println("max = " + max.toPropertyForm(true));
t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount()));
t.entry.setCol(col_wordsInText , (t.wordcount() == 0) ? 0 : 1 + 255 * (t.wordcount() - min.wordcount() ) / (1 + max.wordcount() - min.wordcount()));
t.entry.setCol(col_phrasesInText, (t.phrasecount() == 0) ? 0 : 1 + 255 * (t.phrasecount() - min.phrasecount() ) / (1 + max.phrasecount() - min.phrasecount()));
t.entry.setCol(col_wordsInText , (t.wordsintext() == 0) ? 0 : 1 + 255 * (t.wordsintext() - min.wordsintext() ) / (1 + max.wordsintext() - min.wordsintext()));
t.entry.setCol(col_phrasesInText, (t.phrasesintext() == 0) ? 0 : 1 + 255 * (t.phrasesintext() - min.phrasesintext() ) / (1 + max.phrasesintext() - min.phrasesintext()));
t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext()));
t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase()));
t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase()));
t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); // FIXME: hier gibts ein division by zero, was nur sein kann wenn die Normalisierung nicht geklappt hat.
t.entry.setCol(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified()));
t.entry.setCol(col_urlLength , (t.urllength() == 0) ? 0 : 1 + 255 * (t.urllength() - min.urllength() ) / (1 + max.urllength() - min.urllength()));
t.entry.setCol(col_urlComps , (t.urlcomps() == 0) ? 0 : 1 + 255 * (t.urlcomps() - min.urlcomps() ) / (1 + max.urlcomps() - min.urlcomps()));
t.entry.setCol(col_wordsInTitle , (t.wordsintitle() == 0) ? 0 : 1 + 255 * (t.wordsintitle() - min.wordsintitle()) / (1 + max.wordsintitle() - min.wordsintitle()));
//System.out.println("out = " + t.toPropertyForm(true));
}

@ -189,11 +189,11 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
return (int) this.entry.getColLong(col_posofphrase);
}
public int wordcount() {
public int wordsintext() {
return (int) this.entry.getColLong(col_wordcount);
}
public int phrasecount() {
public int phrasesintext() {
return (int) this.entry.getColLong(col_phrasecount);
}
@ -215,7 +215,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2.posintext()));
ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? ie1.posofphrase() : 0 /*unknown*/);
ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase()));
ie1.entry.setCol(col_wordcount, (ie1.wordcount() + ie2.wordcount()) / 2);
ie1.entry.setCol(col_wordcount, (ie1.wordsintext() + ie2.wordsintext()) / 2);
return ie1;
}
@ -229,8 +229,8 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
public static final void min(indexRWIEntryOld t, indexRWIEntry other) {
if (t.hitcount() > other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
if (t.wordcount() > other.wordcount()) t.entry.setCol(col_wordcount, other.wordcount());
if (t.phrasecount() > other.phrasecount()) t.entry.setCol(col_phrasecount, other.phrasecount());
if (t.wordsintext() > other.wordsintext()) t.entry.setCol(col_wordcount, other.wordsintext());
if (t.phrasesintext() > other.phrasesintext()) t.entry.setCol(col_phrasecount, other.phrasesintext());
if (t.posintext() > other.posintext()) t.entry.setCol(col_posintext, other.posintext());
if (t.posinphrase() > other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
if (t.posofphrase() > other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
@ -241,8 +241,8 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
public static final void max(indexRWIEntryOld t, indexRWIEntry other) {
if (t.hitcount() < other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount());
if (t.wordcount() < other.wordcount()) t.entry.setCol(col_wordcount, other.wordcount());
if (t.phrasecount() < other.phrasecount()) t.entry.setCol(col_phrasecount, other.phrasecount());
if (t.wordsintext() < other.wordsintext()) t.entry.setCol(col_wordcount, other.wordsintext());
if (t.phrasesintext() < other.phrasesintext()) t.entry.setCol(col_phrasecount, other.phrasesintext());
if (t.posintext() < other.posintext()) t.entry.setCol(col_posintext, other.posintext());
if (t.posinphrase() < other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase());
if (t.posofphrase() < other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase());
@ -269,8 +269,8 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
//System.out.println("min = " + min.toPropertyForm(true));
//System.out.println("max = " + max.toPropertyForm(true));
t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount()));
t.entry.setCol(col_wordcount , (t.wordcount() == 0) ? 0 : 1 + 255 * (t.wordcount() - min.wordcount() ) / (1 + max.wordcount() - min.wordcount()));
t.entry.setCol(col_phrasecount , (t.phrasecount() == 0) ? 0 : 1 + 255 * (t.phrasecount() - min.phrasecount() ) / (1 + max.phrasecount() - min.phrasecount()));
t.entry.setCol(col_wordcount , (t.wordsintext() == 0) ? 0 : 1 + 255 * (t.wordsintext() - min.wordsintext() ) / (1 + max.wordsintext() - min.wordsintext()));
t.entry.setCol(col_phrasecount , (t.phrasesintext() == 0) ? 0 : 1 + 255 * (t.phrasesintext() - min.phrasesintext() ) / (1 + max.phrasesintext() - min.phrasesintext()));
t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext()));
t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase()));
t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase()));
@ -309,4 +309,24 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
return false;
}
/**
 * Constant implementation: reads nothing from this entry.
 * NOTE(review): presumably the old entry layout has no column for this attribute - confirm.
 *
 * @return always 0
 */
public int llocal() {
return 0;
}
/**
 * Constant implementation: reads nothing from this entry.
 * NOTE(review): presumably the old entry layout has no column for this attribute - confirm.
 *
 * @return always 0
 */
public int lother() {
return 0;
}
/**
 * Constant implementation: reads nothing from this entry.
 * NOTE(review): presumably the old entry layout has no column for this attribute - confirm.
 *
 * @return always 0
 */
public int urlcomps() {
return 0;
}
/**
 * Constant implementation: reads nothing from this entry.
 * NOTE(review): presumably the old entry layout has no column for this attribute - confirm.
 *
 * @return always 0
 */
public int urllength() {
return 0;
}
/**
 * Constant implementation: reads nothing from this entry.
 * NOTE(review): presumably the old entry layout has no column for this attribute - confirm.
 *
 * @return always 0
 */
public int wordsintitle() {
return 0;
}
}

@ -47,111 +47,189 @@ import java.util.Map;
import java.util.Set;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryNew;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
public class plasmaSearchRankingProfile {
// old parameters for ordering
public static final String ORDER_QUALITY = "Quality";
public static final String ORDER_DATE = "Date";
public static final String ORDER_YBR = "YBR";
// pre-sort attributes
public static final String ENTROPY = "entropy";
public static final String DATE = "date";
public static final String YBR = "ybr";
public static final String POSINTEXT = "posintext";
public static final String WORDDISTANCE = "worddistance";
public static final String HITCOUNT = "hitcount";
public static final String DOMLENGTH = "domlength";
public static final String DOMLENGTH = "domlength";
public static final String YBR = "ybr";
public static final String DATE = "date";
public static final String WORDSINTITLE = "wordsintitle";
public static final String WORDSINTEXT = "wordsintext";
public static final String PHRASESINTEXT = "phrasesintext";
public static final String LLOCAL = "llocal";
public static final String LOTHER = "lother";
public static final String URLLENGTH = "urllength";
public static final String URLCOMPS = "urlcomps";
public static final String HITCOUNT = "hitcount";
public static final String POSINTEXT = "posintext";
public static final String POSOFPHRASE = "posofphrase";
public static final String WORDDISTANCE = "worddistance";
public static final String APPURL = "appurl";
public static final String APPDESCR = "appdescr";
public static final String APPAUTHOR = "appauthor";
public static final String APPTAGS = "apptags";
public static final String APPREF = "appref";
public static final String APPEMPH = "appemph";
public static final String CATINDEXOF = "catindexof";
public static final String CATHASIMAGE = "cathasimage";
public static final String CATHASAUDIO = "cathasaudio";
public static final String CATHASVIDEO = "cathasvideo";
public static final String CATHASAPP = "cathasapp";
// post-sort attributes
public static final String URLLENGTH = "urllength";
public static final String URLCOMPS = "urlcomps";
public static final String DESCRLENGTH = "descrlength";
public static final String DESCRCOMPS = "descrcomps";
// post-sort predicates
public static final String QUERYINURL = "queryinurl";
public static final String QUERYINDESCR = "queryindescr";
public static final String URLCOMPINTOPLIST = "urlcompintoplist";
public static final String QUERYINURL = "queryinurl";
public static final String QUERYINDESCR = "queryindescr";
public static final String URLCOMPINTOPLIST = "urlcompintoplist";
public static final String DESCRCOMPINTOPLIST = "descrcompintoplist";
public static final String PREFER = "prefer";
private int
coeff_domlength, coeff_ybr, coeff_date, coeff_wordsintitle, coeff_wordsintext, coeff_phrasesintext,
coeff_llocal, coeff_lother, coeff_urllength, coeff_urlcomps, coeff_hitcount,
coeff_posintext, coeff_posofphrase, coeff_worddistance,
coeff_appurl, coeff_appdescr, coeff_appauthor, coeff_apptags, coeff_appref, coeff_appemph,
coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp,
coeff_queryinurl, coeff_queryindescr, coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer;
public String[] order;
private HashMap coeff;
public plasmaSearchRankingProfile() {
// set some default-values
this.order = null;
this.coeff = new HashMap();
coeff.put(ENTROPY, new Integer(0));
coeff.put(DATE, new Integer(4));
coeff.put(YBR, new Integer(8));
coeff.put(POSINTEXT, new Integer(7));
coeff.put(WORDDISTANCE, new Integer(6));
coeff.put(HITCOUNT, new Integer(5));
coeff.put(DOMLENGTH, new Integer(8));
coeff.put(URLLENGTH, new Integer(15));
coeff.put(URLCOMPS, new Integer(15));
coeff.put(DESCRLENGTH, new Integer(4));
coeff.put(DESCRCOMPS, new Integer(4));
coeff.put(QUERYINURL, new Integer(13));
coeff.put(QUERYINDESCR, new Integer(8));
coeff.put(URLCOMPINTOPLIST, new Integer(3));
coeff.put(DESCRCOMPINTOPLIST, new Integer(2));
coeff.put(PREFER, new Integer(15));
public plasmaSearchRankingProfile(String mediatype) {
// set default-values
if (mediatype == null) mediatype = "text";
coeff_domlength = 8;
coeff_ybr = 8;
coeff_date = 4;
coeff_wordsintitle = 4;
coeff_wordsintext = 1;
coeff_phrasesintext = 1;
coeff_llocal = 2;
coeff_lother = 3;
coeff_urllength = 14;
coeff_urlcomps = 14;
coeff_hitcount = 5;
coeff_posintext = 7;
coeff_posofphrase = 6;
coeff_worddistance = 15;
coeff_appurl = 14;
coeff_appdescr = 13;
coeff_appauthor = 13;
coeff_apptags = 8;
coeff_appref = 9;
coeff_appemph = 11;
coeff_queryinurl = 12;
coeff_queryindescr = 8;
coeff_urlcompintoplist = 3;
coeff_descrcompintoplist = 2;
coeff_prefer = 15;
coeff_catindexof = (mediatype.equals("text")) ? 0 : 10;
coeff_cathasimage = (mediatype.equals("image")) ? 15 : 0;
coeff_cathasaudio = (mediatype.equals("audio")) ? 15 : 0;
coeff_cathasvideo = (mediatype.equals("video")) ? 15 : 0;
coeff_cathasapp = (mediatype.equals("app")) ? 15 : 0;
}
public plasmaSearchRankingProfile(String prefix, String profile) {
this(); // set defaults
//parse external form
String[] elts = profile.substring(1, profile.length() - 1).split(",");
int p;
int s = prefix.length();
String e;
for (int i = 0; i < elts.length; i++) {
e = elts[i].trim();
if ((s == 0) || (e.startsWith(prefix))) {
coeff.put(e.substring(s, (p = e.indexOf("="))), new Integer(Integer.parseInt(e.substring(p + 1))));
// Start from the default text-search coefficients; every value that the
// external profile does not (validly) define keeps its default.
this("text");
if ((profile != null) && (profile.length() > 0)) {
    // parse the external form "{key=value, key=value, ...}"; the enclosing braces are optional
    HashMap coeff = new HashMap();
    String body = (profile.startsWith("{") && profile.endsWith("}")) ? profile.substring(1, profile.length() - 1) : profile;
    String[] elts = body.split(",");
    int s = (prefix == null) ? 0 : prefix.length();
    for (int i = 0; i < elts.length; i++) {
        String e = elts[i].trim();
        // with a prefix given, only elements carrying that prefix belong to this profile
        if ((s != 0) && (!e.startsWith(prefix))) continue;
        int p = e.indexOf('=', s);
        // skip elements that are not a key=value pair (e.g. the single empty element of "{}")
        if (p < 0) continue;
        // keep the raw value text: parseMap converts it and falls back to the
        // default on malformed numbers instead of aborting the whole profile
        coeff.put(e.substring(s, p), e.substring(p + 1).trim());
    }
    coeff_domlength = parseMap(coeff, DOMLENGTH, coeff_domlength);
    coeff_ybr = parseMap(coeff, YBR, coeff_ybr);
    coeff_date = parseMap(coeff, DATE, coeff_date);
    coeff_wordsintitle = parseMap(coeff, WORDSINTITLE, coeff_wordsintitle);
    coeff_wordsintext = parseMap(coeff, WORDSINTEXT, coeff_wordsintext);
    coeff_phrasesintext = parseMap(coeff, PHRASESINTEXT, coeff_phrasesintext);
    coeff_llocal = parseMap(coeff, LLOCAL, coeff_llocal);
    coeff_lother = parseMap(coeff, LOTHER, coeff_lother);
    coeff_urllength = parseMap(coeff, URLLENGTH, coeff_urllength);
    coeff_urlcomps = parseMap(coeff, URLCOMPS, coeff_urlcomps);
    coeff_hitcount = parseMap(coeff, HITCOUNT, coeff_hitcount);
    coeff_posintext = parseMap(coeff, POSINTEXT, coeff_posintext);
    coeff_posofphrase = parseMap(coeff, POSOFPHRASE, coeff_posofphrase);
    coeff_worddistance = parseMap(coeff, WORDDISTANCE, coeff_worddistance);
    coeff_appurl = parseMap(coeff, APPURL, coeff_appurl);
    coeff_appdescr = parseMap(coeff, APPDESCR, coeff_appdescr);
    coeff_appauthor = parseMap(coeff, APPAUTHOR, coeff_appauthor);
    coeff_apptags = parseMap(coeff, APPTAGS, coeff_apptags);
    coeff_appref = parseMap(coeff, APPREF, coeff_appref);
    coeff_appemph = parseMap(coeff, APPEMPH, coeff_appemph);
    // each category coefficient is looked up under its own key
    coeff_catindexof = parseMap(coeff, CATINDEXOF, coeff_catindexof);
    coeff_cathasimage = parseMap(coeff, CATHASIMAGE, coeff_cathasimage);
    coeff_cathasaudio = parseMap(coeff, CATHASAUDIO, coeff_cathasaudio);
    coeff_cathasvideo = parseMap(coeff, CATHASVIDEO, coeff_cathasvideo);
    coeff_cathasapp = parseMap(coeff, CATHASAPP, coeff_cathasapp);
    coeff_queryinurl = parseMap(coeff, QUERYINURL, coeff_queryinurl);
    coeff_queryindescr = parseMap(coeff, QUERYINDESCR, coeff_queryindescr);
    coeff_urlcompintoplist = parseMap(coeff, URLCOMPINTOPLIST, coeff_urlcompintoplist);
    coeff_descrcompintoplist = parseMap(coeff, DESCRCOMPINTOPLIST, coeff_descrcompintoplist);
    coeff_prefer = parseMap(coeff, PREFER, coeff_prefer);
}
}
public plasmaSearchRankingProfile(String[] order) {
this(); // set defaults
this.order = order;
// overwrite defaults with order attributes
for (int i = 0; i < 3; i++) {
if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_QUALITY)) coeff.put(ENTROPY, new Integer((3 * (3 - i))));
else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_DATE)) coeff.put(DATE, new Integer((3 * (3 - i))));
else if (this.order[i].equals(plasmaSearchRankingProfile.ORDER_YBR)) coeff.put(YBR, new Integer((3 * (3 - i))));
/**
 * Reads one coefficient from a parsed profile map.
 *
 * The stored value may be a number object (e.g. <code>Integer</code>) or a
 * <code>String</code> holding a decimal integer. Anything else - a missing
 * key, a <code>null</code> value, an unparsable string or an unexpected
 * type - yields the supplied default.
 *
 * @param coeff map from attribute name to coefficient value
 * @param attr  name of the attribute to look up
 * @param dflt  value to return when no usable coefficient is stored
 * @return the stored coefficient, or <code>dflt</code>
 */
private static int parseMap(HashMap coeff, String attr, int dflt) {
    Object value = coeff.get(attr);
    // values may arrive as number objects or as decimal strings; accept both
    // rather than blindly casting to String (which fails for Integer values)
    if (value instanceof Number) return ((Number) value).intValue();
    if (value instanceof String) {
        try {
            return Integer.parseInt(((String) value).trim());
        } catch (NumberFormatException e) {
            // malformed number: keep the default
            return dflt;
        }
    }
    return dflt;
}
/**
 * Renders the ordering as a dash-separated string.
 *
 * @return the first three entries of <code>order</code> joined with "-",
 *         or "YBR-Date-Quality" when no order has been set
 */
public String orderString() {
    if (order != null) {
        StringBuffer joined = new StringBuffer();
        joined.append(order[0]);
        joined.append("-");
        joined.append(order[1]);
        joined.append("-");
        joined.append(order[2]);
        return joined.toString();
    }
    // no explicit order: fall back to the fixed default sequence
    return "YBR-Date-Quality";
}
public String toExternalString() {
return coeff.toString();
return toExternalMap("").toString();
}
public Map toExternalMap(String prefix) {
Iterator i = this.coeff.entrySet().iterator();
Map.Entry entry;
Map ext = new HashMap();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
ext.put(prefix + (String) entry.getKey(), entry.getValue());
}
ext.put(prefix + DOMLENGTH, Integer.toString(coeff_domlength));
ext.put(prefix + YBR, Integer.toString(coeff_ybr));
ext.put(prefix + DATE, Integer.toString(coeff_date));
ext.put(prefix + WORDSINTITLE, Integer.toString(coeff_wordsintitle));
ext.put(prefix + WORDSINTEXT, Integer.toString(coeff_wordsintext));
ext.put(prefix + PHRASESINTEXT, Integer.toString(coeff_phrasesintext));
ext.put(prefix + LLOCAL, Integer.toString(coeff_llocal));
ext.put(prefix + LOTHER, Integer.toString(coeff_lother));
ext.put(prefix + URLLENGTH, Integer.toString(coeff_urllength));
ext.put(prefix + URLCOMPS, Integer.toString(coeff_urlcomps));
ext.put(prefix + HITCOUNT, Integer.toString(coeff_hitcount));
ext.put(prefix + POSINTEXT, Integer.toString(coeff_posintext));
ext.put(prefix + POSOFPHRASE, Integer.toString(coeff_posofphrase));
ext.put(prefix + WORDDISTANCE, Integer.toString(coeff_worddistance));
ext.put(prefix + APPURL, Integer.toString(coeff_appurl));
ext.put(prefix + APPDESCR, Integer.toString(coeff_appdescr));
ext.put(prefix + APPAUTHOR, Integer.toString(coeff_appauthor));
ext.put(prefix + APPTAGS, Integer.toString(coeff_apptags));
ext.put(prefix + APPREF, Integer.toString(coeff_appref));
ext.put(prefix + APPEMPH, Integer.toString(coeff_appemph));
ext.put(prefix + CATINDEXOF, Integer.toString(coeff_catindexof));
ext.put(prefix + CATHASIMAGE, Integer.toString(coeff_cathasimage));
ext.put(prefix + CATHASAUDIO, Integer.toString(coeff_cathasaudio));
ext.put(prefix + CATHASVIDEO, Integer.toString(coeff_cathasvideo));
ext.put(prefix + CATHASAPP, Integer.toString(coeff_cathasapp));
ext.put(prefix + QUERYINURL, Integer.toString(coeff_queryinurl));
ext.put(prefix + QUERYINDESCR, Integer.toString(coeff_queryindescr));
ext.put(prefix + URLCOMPINTOPLIST, Integer.toString(coeff_urlcompintoplist));
ext.put(prefix + DESCRCOMPINTOPLIST, Integer.toString(coeff_descrcompintoplist));
ext.put(prefix + PREFER, Integer.toString(coeff_prefer));
return ext;
}
public String toExternalURLGet(String prefix) {
Iterator i = this.coeff.entrySet().iterator();
Iterator i = toExternalMap("").entrySet().iterator();
Map.Entry entry;
StringBuffer ext = new StringBuffer();
while (i.hasNext()) {
@ -168,15 +246,37 @@ public class plasmaSearchRankingProfile {
public long preRanking(indexRWIEntry normalizedEntry, String searchedWord) {
// the normalizedEntry must be a normalized indexEntry
long ranking = 0;
ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue();
ranking += normalizedEntry.virtualAge() << ((Integer) coeff.get(DATE)).intValue();
ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.urlHash()) << ((Integer) coeff.get(YBR)).intValue();
ranking += (normalizedEntry.posintext() == 0) ? 0 : (256 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (256 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
ranking += (256 - plasmaURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue();
ranking += (plasmaURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << ((Integer) coeff.get(URLLENGTH)).intValue() : 0;
ranking += (plasmaURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord) != null) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0;
// Every attribute contributes its value shifted left by the profile coefficient.
// Attributes where a smaller value should score higher are inverted as (256 - value).
ranking += (256 - plasmaURL.domLengthNormalized(normalizedEntry.urlHash())) << coeff_domlength;
ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.urlHash()) << coeff_ybr;
ranking += normalizedEntry.virtualAge() << coeff_date;
ranking += normalizedEntry.wordsintitle() << coeff_wordsintitle;
ranking += normalizedEntry.wordsintext() << coeff_wordsintext;
ranking += normalizedEntry.phrasesintext() << coeff_phrasesintext;
ranking += normalizedEntry.llocal() << coeff_llocal;
ranking += normalizedEntry.lother() << coeff_lother;
// for the following attributes a value of 0 contributes nothing
ranking += (normalizedEntry.urllength() == 0) ? 0 : (256 - normalizedEntry.urllength()) << coeff_urllength;
ranking += (normalizedEntry.urlcomps() == 0) ? 0 : (256 - normalizedEntry.urlcomps()) << coeff_urlcomps;
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << coeff_hitcount;
ranking += (normalizedEntry.posintext() == 0) ? 0 : (256 - normalizedEntry.posintext()) << coeff_posintext;
// score the phrase position itself; the hit count must not be used here
ranking += (normalizedEntry.posofphrase() == 0) ? 0 : (256 - normalizedEntry.posofphrase()) << coeff_posofphrase;
ranking += (normalizedEntry.worddistance() == 0) ? 0 : (256 - normalizedEntry.worddistance()) << coeff_worddistance;

// each set flag contributes a fixed 256, shifted by the coefficient of that flag
kelondroBitfield flags = normalizedEntry.flags();
ranking += (flags.get(indexRWIEntryNew.flag_app_url)) ? 256 << coeff_appurl : 0;
ranking += (flags.get(indexRWIEntryNew.flag_app_descr)) ? 256 << coeff_appdescr : 0;
ranking += (flags.get(indexRWIEntryNew.flag_app_author)) ? 256 << coeff_appauthor : 0;
ranking += (flags.get(indexRWIEntryNew.flag_app_tags)) ? 256 << coeff_apptags : 0;
ranking += (flags.get(indexRWIEntryNew.flag_app_reference)) ? 256 << coeff_appref : 0;
ranking += (flags.get(indexRWIEntryNew.flag_app_emphasized)) ? 256 << coeff_appemph : 0;
ranking += (flags.get(plasmaCondenser.flag_cat_indexof)) ? 256 << coeff_catindexof : 0;
ranking += (flags.get(plasmaCondenser.flag_cat_hasimage)) ? 256 << coeff_cathasimage : 0;
ranking += (flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 256 << coeff_cathasaudio : 0;
ranking += (flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 256 << coeff_cathasvideo : 0;
ranking += (flags.get(plasmaCondenser.flag_cat_hasapp)) ? 256 << coeff_cathasapp : 0;

// predicates derived from the url hash alone
ranking += (plasmaURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << coeff_urllength : 0;
ranking += (plasmaURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord) != null) ? 256 << coeff_queryinurl : 0;
/*
if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord))
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking);
@ -199,15 +299,15 @@ public class plasmaSearchRankingProfile {
// prefer hit with 'prefer' pattern
indexURLEntry.Components comp = page.comp();
if (comp.url().toNormalform().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
if (comp.descr().matches(query.prefer)) ranking += 256 << ((Integer) coeff.get(PREFER)).intValue();
if (comp.url().toNormalform().matches(query.prefer)) ranking += 256 << coeff_prefer;
if (comp.descr().matches(query.prefer)) ranking += 256 << coeff_prefer;
// apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) {
if (topwords.contains(urlcomps[j])) ranking += 256 << ((Integer) coeff.get(URLCOMPINTOPLIST)).intValue();
if (topwords.contains(urlcomps[j])) ranking += 256 << coeff_urlcompintoplist;
}
for (int j = 0; j < descrcomps.length; j++) {
if (topwords.contains(descrcomps[j])) ranking += 256 << ((Integer) coeff.get(DESCRCOMPINTOPLIST)).intValue();
if (topwords.contains(descrcomps[j])) ranking += 256 << coeff_descrcompintoplist;
}
// apply query-in-result matching
@ -217,18 +317,10 @@ public class plasmaSearchRankingProfile {
String queryhash;
while (shi.hasNext()) {
queryhash = (String) shi.next();
if (urlcomph.contains(queryhash)) ranking += 256 << ((Integer) coeff.get(QUERYINURL)).intValue();
if (descrcomph.contains(queryhash)) ranking += 256 << ((Integer) coeff.get(QUERYINDESCR)).intValue();
if (urlcomph.contains(queryhash)) ranking += 256 << coeff_queryinurl;
if (descrcomph.contains(queryhash)) ranking += 256 << coeff_queryindescr;
}
// prefer short urls
ranking += (256 - comp.url().toNormalform().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
ranking += (8 * Math.max(0, 32 - urlcomps.length)) << ((Integer) coeff.get(URLCOMPS)).intValue();
// prefer long descriptions
ranking += (256 * comp.url().toNormalform().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
ranking += (256 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
return ranking;
}

@ -61,7 +61,6 @@ import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
@ -587,19 +586,20 @@ public class plasmaSnippetCache {
}
}
public ArrayList retrieveMediaSnippets(URL url, Set queryhashes, boolean fetchOnline, int timeout) {
public ArrayList retrieveMediaSnippets(URL url, Set queryhashes, String mediatype, boolean fetchOnline, int timeout) {
if (queryhashes.size() == 0) {
serverLog.logFine("snippet fetch", "no query hashes given for url " + url);
return new ArrayList();
}
if (mediatype == null) mediatype = "";
plasmaParserDocument document = retrieveDocument(url, fetchOnline, timeout, false);
ArrayList a = new ArrayList();
if (document != null) {
a.addAll(computeMediaSnippets(document, queryhashes, "audio"));
a.addAll(computeMediaSnippets(document, queryhashes, "video"));
a.addAll(computeMediaSnippets(document, queryhashes, "app"));
a.addAll(computeImageSnippets(document, queryhashes));
if ((mediatype.length() == 0) || (mediatype.equals("audio"))) a.addAll(computeMediaSnippets(document, queryhashes, "audio"));
if ((mediatype.length() == 0) || (mediatype.equals("video"))) a.addAll(computeMediaSnippets(document, queryhashes, "video"));
if ((mediatype.length() == 0) || (mediatype.equals("app" ))) a.addAll(computeMediaSnippets(document, queryhashes, "app"));
if ((mediatype.length() == 0) || (mediatype.equals("image"))) a.addAll(computeImageSnippets(document, queryhashes));
}
return a;
}
@ -838,7 +838,7 @@ public class plasmaSnippetCache {
return result;
}
/*
public void fetch(plasmaSearchResult acc, Set queryhashes, String urlmask, int fetchcount, long maxTime) {
// fetch snippets
int i = 0;
@ -879,5 +879,5 @@ public class plasmaSnippetCache {
log.logFine("snippetFetcher: got URL " + url + ", the snippet is '" + snippet.line + "', source=" + snippet.source);
}
}
*/
}

@ -2160,8 +2160,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// suppress line: there is no match in that resource
} else {*/
prop.put("type_results_" + i + "_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", urlstring) == null) ? 1 : 0);
prop.put("type_results_" + i + "_recommend_deletelink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_recommend_recommendlink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + ranking.orderString() + "&resource=local&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_recommend_deletelink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_recommend_recommendlink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_description", comp.descr());
prop.put("type_results_" + i + "_url", urlstring);
prop.put("type_results_" + i + "_urlhash", urlhash);

@ -432,9 +432,9 @@ public final class yacyClient {
obj.put("filter", filter);
obj.put("ttl", "0");
obj.put("duetime", Long.toString(duetime));
obj.put("profile", timingProfile.targetToString()); // new duetimes splitted by specific search tasks
obj.put("timing", crypt.simpleEncode(timingProfile.targetToString())); // new duetimes splitted by specific search tasks
obj.put("maxdist", maxDistance);
obj.put("rankingProfile", rankingProfile.toExternalString());
obj.put("profile", crypt.simpleEncode(rankingProfile.toExternalString()));
obj.put("constraint", constraint.exportB64());
obj.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date()));
if (abstractCache != null) obj.put("abstracts", "auto");

Loading…
Cancel
Save