- some refactoring in the workflow package

- some refactoring in the search process (see the sketch below)
- fixed image search for JSON and RSS output
- search navigation at the bottom of the search result page when a page holds more than 6 results
- fixes for the number of displayed documents
- disabled pseudostemming

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6504 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 969123385b
commit 491ba6a1ba
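
The heart of the search-process refactoring: QueryParams now carries the index Segment and builds its ReferenceOrder itself, so RankingProcess, SearchEvent, ResultFetcher and SearchEventCache no longer take a Segment parameter. A minimal sketch of the new call pattern, assuming the YaCy classes changed below (word, filter, segment and sb are placeholders supplied by the caller):

import de.anomic.search.QueryParams;
import de.anomic.search.RankingProcess;

// old: query  = new QueryParams(word, -1, sb.getRanking(), filter);
//      ranked = new RankingProcess(segment, query, Integer.MAX_VALUE, 1);
// new: the segment and the ranking profile travel inside the query object
QueryParams query = new QueryParams(word, -1, filter, segment, sb.getRanking());
RankingProcess ranked = new RankingProcess(query, Integer.MAX_VALUE, 1);
ranked.run();
// downstream code reaches both through the query object:
// query.getSegment(), query.getOrder(), ranked.getQuery().getOrder()
// the long QueryParams constructor moved the same way: its rankingProfile
// argument now sits at the tail, after specialRights and indexSegment.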

@ -47,9 +47,9 @@
<dt><label for="adminuser">Peer User:</label></dt>
<dd><input type="text" name="adminuser" id="adminuser" value="#[defaultUser]#" size="16" maxlength="32" /></dd>
<dt><label for="adminpw1">New Peer Password:</label></dt>
<dd><input type="password" name="adminpw1" id="adminpw1" value="" size="16" maxlength="32" /></dd>
<dd><input type="password" name="adminpw1" id="adminpw1" value="" size="16" maxlength="1024" /></dd>
<dt><label for="adminpw2">Repeat Peer Password:</label></dt>
<dd><input type="password" name="adminpw2" id="adminpw2" value="" size="16" maxlength="32" /></dd>
<dd><input type="password" name="adminpw2" id="adminpw2" value="" size="16" maxlength="1024" /></dd>
</dl>
</fieldset>
<input type="submit" name="setAdmin" value="Define Administrator" />

@ -407,7 +407,7 @@ public class IndexControlRWIs_p {
prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", DigestURI.domLengthEstimation(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", RankingProcess.ybr(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_tf", 1000.0 * entry.word().termFrequency());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getQuery().getOrder() == null) ? -1 : ranked.getQuery().getOrder().authority(entry.hash()));
prop.put("genUrlList_urlList_"+i+"_urlExists_date", DateFormatter.formatShortDay(new Date(entry.word().lastModified())));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext());
@ -502,8 +502,8 @@ public class IndexControlRWIs_p {
}
public static RankingProcess genSearchresult(final serverObjects prop, final Switchboard sb, Segment segment, final byte[] keyhash, final Bitfield filter) {
final QueryParams query = new QueryParams(new String(keyhash), -1, sb.getRanking(), filter);
final RankingProcess ranked = new RankingProcess(segment, query, Integer.MAX_VALUE, 1);
final QueryParams query = new QueryParams(new String(keyhash), -1, filter, segment, sb.getRanking());
final RankingProcess ranked = new RankingProcess(query, Integer.MAX_VALUE, 1);
ranked.run();
if (ranked.filteredCount() == 0) {

@ -371,7 +371,6 @@ div.yacylogo {
float:left;
}
/*----------
<h1>, <h2>, <h3>, <h4>, <h5>
*/

@ -53,6 +53,7 @@ import de.anomic.search.QueryParams;
import de.anomic.search.RankingProfile;
import de.anomic.search.SearchEvent;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Segment;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.search.ResultEntry;
@ -189,13 +190,13 @@ public final class search {
SearchEvent theSearch = null;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
Segment indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
theQuery = new QueryParams(
null,
abstractSet,
new TreeSet<byte[]>(Base64Order.enhancedCoder),
null,
null,
rankingProfile,
maxdist,
prefer,
ContentDomain.contentdomParser(contentdom),
@ -213,13 +214,16 @@ public final class search {
authorhash,
DigestURI.TLD_any_zone_filter,
client,
false);
false,
indexSegment,
rankingProfile
);
theQuery.domType = QueryParams.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
final long timer = System.currentTimeMillis();
//final Map<byte[], ReferenceContainer<WordReference>>[] containers = sb.indexSegment.index().searchTerm(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2StringSet(urls));
final HashMap<byte[], ReferenceContainer<WordReference>> incc = sb.indexSegments.termIndex(Segments.Process.PUBLIC).searchConjunction(theQuery.queryHashes, QueryParams.hashes2StringSet(urls));
final HashMap<byte[], ReferenceContainer<WordReference>> incc = indexSegment.termIndex().searchConjunction(theQuery.queryHashes, QueryParams.hashes2StringSet(urls));
MemoryTracker.update("SEARCH", new ProfilingGraph.searchEvent(theQuery.id(true), SearchEvent.COLLECTION, incc.size(), System.currentTimeMillis() - timer), false);
if (incc != null) {
@ -247,10 +251,9 @@ public final class search {
excludehashes,
null,
null,
rankingProfile,
maxdist,
prefer,
ContentDomain.contentdomParser(contentdom),
prefer,
ContentDomain.contentdomParser(contentdom),
language,
"", // no navigation
false,
@ -265,13 +268,16 @@ public final class search {
authorhash,
DigestURI.TLD_any_zone_filter,
client,
false);
false,
sb.indexSegments.segment(Segments.Process.PUBLIC),
rankingProfile
);
theQuery.domType = QueryParams.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
RSSFeed.channels(RSSFeed.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.queryHashes), ""));
// make event
theSearch = SearchEventCache.getEvent(theQuery, sb.indexSegments.segment(Segments.Process.PUBLIC), sb.peers, sb.crawlResults, null, true);
theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, null, true);
// set statistic details of search result and find best result index set
if (theSearch.getRankingResult().getLocalResourceSize() == 0) {

@ -165,6 +165,7 @@ var progressbar = new Progressbar(#[results]#, document.getElementById("results"
#(resultTable)#::</table>#(/resultTable)#
<!-- linklist end -->
<!-- attach the bottomline -->
#(pageNavBottom)#::<div id="pageNavBottom" align="center">#[resnav]#</div>#(/pageNavBottom)#
</div>
<div style="width=220px;">
<!--#include virtual="yacysearchtrailer.html?eventID=#[eventID]#&display=#[display]#" -->

@ -440,7 +440,6 @@ public class yacysearch {
Word.words2hashes(query[1]),
Word.words2hashes(query[2]),
tenant,
ranking,
maxDistance,
prefermask,
contentdomCode,
@ -459,7 +458,9 @@ public class yacysearch {
authorhash,
DigestURI.TLD_any_zone_filter,
client,
authenticated);
authenticated,
indexSegment,
ranking);
MemoryTracker.update("SEARCH", new ProfilingGraph.searchEvent(theQuery.id(true), SearchEvent.INITIALIZATION, 0, 0), false);
// tell all threads to do nothing for a specific time
@ -478,7 +479,7 @@ public class yacysearch {
theQuery.setOffset(0); // in case this is a new search, always start without an offset
offset = 0;
}
final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, indexSegment, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false);
final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false);
// generate result object
//serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");
@ -571,7 +572,7 @@ public class yacysearch {
final int totalcount = theSearch.getRankingResult().getLocalResourceSize() + theSearch.getRankingResult().getRemoteResourceSize();
prop.put("num-results_offset", offset);
prop.put("num-results_itemscount", "0");
prop.put("num-results_itemscount", Formatter.number(0, true));
prop.put("num-results_itemsPerPage", itemsPerPage);
prop.put("num-results_totalcount", Formatter.number(totalcount, true));
prop.put("num-results_globalresults", (globalsearch) ? "1" : "0");
@ -611,7 +612,10 @@ public class yacysearch {
resnav.append(QueryParams.navurl("html", thispage + 1, display, theQuery, originalUrlMask, null, navigation));
resnav.append("\"><img src=\"env/grafics/navdr.gif\" width=\"16\" height=\"16\"></a>");
}
prop.put("num-results_resnav", resnav.toString());
String resnavs = resnav.toString();
prop.put("num-results_resnav", resnavs);
prop.put("pageNavBottom", (totalcount - offset > 6) ? 1 : 0); // if there are more results than may fit on the page we add a navigation at the bottom
prop.put("pageNavBottom_resnav", resnavs);
// generate the search result lines; the content will be produced by another servlet
for (int i = 0; i < theQuery.displayResults(); i++) {
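
For context on the pageNavBottom puts above: in YaCy's servlet templates, #[name]# is a plain substitution and #(name)#a::b#(/name)# picks alternative a or b via the integer stored under name. A sketch of how the servlet toggle maps onto the yacysearch.html block added earlier (the threshold of 6 results per page is the one from this commit):

// yacysearch.java: alternative 1 (the <div>) renders only when more results
// exist than fit on the current page; alternative 0 is the empty string
prop.put("pageNavBottom", (totalcount - offset > 6) ? 1 : 0);
prop.put("pageNavBottom_resnav", resnavs); // reuse the same navigation links
// yacysearch.html:
// #(pageNavBottom)#::<div id="pageNavBottom" align="center">#[resnav]#</div>#(/pageNavBottom)#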

@ -86,10 +86,11 @@ public class yacysearchitem {
final QueryParams theQuery = theSearch.getQuery();
// dynamically update count values
final int totalcount = theSearch.getRankingResult().getLocalResourceSize() + theSearch.getRankingResult().getRemoteResourceSize();
final int offset = theQuery.neededResults() - theQuery.displayResults() + 1;
prop.put("offset", offset);
prop.put("itemscount", (item < 0) ? theQuery.neededResults() : item + 1);
prop.put("totalcount", Formatter.number(theSearch.getRankingResult().getLocalResourceSize() + theSearch.getRankingResult().getRemoteResourceSize(), true));
prop.put("itemscount", Formatter.number(Math.min((item < 0) ? theQuery.neededResults() : item + 1, totalcount)));
prop.put("totalcount", Formatter.number(totalcount, true));
prop.put("localResourceSize", Formatter.number(theSearch.getRankingResult().getLocalResourceSize(), true));
prop.put("remoteResourceSize", Formatter.number(theSearch.getRankingResult().getRemoteResourceSize(), true));
prop.put("remoteIndexCount", Formatter.number(theSearch.getRankingResult().getRemoteIndexCount(), true));
@ -169,7 +170,7 @@ public class yacysearchitem {
prop.putHTML("content_item_href", ms.href.toNormalform(true, false));
prop.put("content_item_code", sb.licensedURLs.aquireLicense(ms.href));
prop.putHTML("content_item_name", shorten(ms.name, namelength));
prop.put("content_item_mime", ms.mime);
prop.put("content_item_mimetype", ms.mime);
prop.put("content_item_fileSize", ms.fileSize);
prop.put("content_item_width", ms.width);
prop.put("content_item_height", ms.height);

@ -9,7 +9,8 @@
<yacy:path>#[path]#</yacy:path>
<yacy:file>#[file]#</yacy:file>
<guid isPermaLink="false">#[urlhash]#</guid>
</item>::#(item)#::<item>
</item>::
#(item)#::<item>
<title>#[name]#</title>
<link>#[source]#</link>
<description></description>
@ -18,32 +19,32 @@
<yacy:host>#[sourcedom]#</yacy:host>
<media:group>
<media:content
url="#[href]#"
fileSize="#[fileSize]#"
type="#[mime]#"
medium="image"
isDefault="true"
expression="full"
height="#[width]#"
width="#[height]#" />
url="#[href]#"
fileSize="#[fileSize]#"
type="#[mimetype]#"
medium="image"
isDefault="true"
expression="full"
height="#[width]#"
width="#[height]#" />
<media:content
url="#[hrefCache]#"
fileSize="#[fileSize]#"
type="#[mime]#"
medium="image"
isDefault="false"
expression="full"
height="#[width]#"
width="#[height]#" />
url="#[hrefCache]#"
fileSize="#[fileSize]#"
type="#[mimetype]#"
medium="image"
isDefault="false"
expression="full"
height="#[width]#"
width="#[height]#" />
<media:content
url="/ViewImage.png?maxwidth=96&amp;maxheight=96&amp;code=#[code]#"
fileSize="#[fileSize]#"
type="#[mime]#"
medium="image"
isDefault="false"
expression="sample"
height="96"
width="96" />
url="/ViewImage.png?maxwidth=96&amp;maxheight=96&amp;code=#[code]#"
fileSize="#[fileSize]#"
type="#[mimetype]#"
medium="image"
isDefault="false"
expression="sample"
height="96"
width="96" />
</media:group>
</item>#(/item)#::
#(/content)#
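
The image-search fix for the JSON and RSS output pairs two changes: the MediaSnippet constructor below now actually stores its mime argument (previously the value was dropped), and the servlet key and template placeholder were renamed in lockstep from mime to mimetype. The contract, sketched:

// yacysearchitem.java writes the value under the new key ...
prop.put("content_item_mimetype", ms.mime);
// ... and yacysearch.rss reads the same key:
// type="#[mimetype]#"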

@ -194,15 +194,13 @@ public class DocumentIndex extends Segment {
public static final ArrayList<URIMetadataRow> findMetadata(
final String querystring,
final Segment indexSegment) {
QueryParams query = new QueryParams(querystring, 100, textRankingDefault, null);
return findMetadata(query, indexSegment);
QueryParams query = new QueryParams(querystring, 100, null, indexSegment, textRankingDefault);
return findMetadata(query);
}
public static final ArrayList<URIMetadataRow> findMetadata(
final QueryParams query,
final Segment indexSegment) {
public static final ArrayList<URIMetadataRow> findMetadata(final QueryParams query) {
RankingProcess rankedCache = new RankingProcess(indexSegment, query, 1000, 2);
RankingProcess rankedCache = new RankingProcess(query, 1000, 2);
rankedCache.run();
ArrayList<URIMetadataRow> result = new ArrayList<URIMetadataRow>();

@ -71,6 +71,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
public MediaSnippet(final ContentDomain type, final DigestURI href, final String mime, final String name, final long fileSize, final int width, final int height, final int ranking, final DigestURI source) {
this.type = type;
this.href = href;
this.mime = mime;
this.fileSize = fileSize;
this.source = source; // the web page where the media resource appeared
this.name = name;

@ -75,6 +75,8 @@ public final class QueryParams {
public boolean allofconstraint;
public boolean onlineSnippetFetch;
public RankingProfile ranking;
private Segment indexSegment;
private final ReferenceOrder order;
public String host; // this is the client host that starts the query, not a site operator
public String sitehash; // this is a domain hash, 6 bytes long or null
public String authorhash;
@ -88,8 +90,9 @@ public final class QueryParams {
public QueryParams(final String queryString,
final int itemsPerPage,
final RankingProfile ranking,
final Bitfield constraint) {
final Bitfield constraint,
final Segment indexSegment,
final RankingProfile ranking) {
if ((queryString.length() == 12) && (Base64Order.enhancedCoder.wellformed(queryString.getBytes()))) {
this.queryString = null;
this.queryHashes = new TreeSet<byte[]>(Base64Order.enhancedCoder);
@ -124,6 +127,8 @@ public final class QueryParams {
this.handle = Long.valueOf(System.currentTimeMillis());
this.specialRights = false;
this.navigators = "all";
this.order = new ReferenceOrder(this.ranking, this.targetlang);
this.indexSegment = indexSegment;
}
public QueryParams(
@ -131,7 +136,6 @@ public final class QueryParams {
final TreeSet<byte[]> excludeHashes,
final TreeSet<byte[]> fullqueryHashes,
final String tenant,
final RankingProfile ranking,
final int maxDistance, final String prefer, final ContentDomain contentdom,
final String language,
final String navigators,
@ -143,7 +147,9 @@ public final class QueryParams {
final String authorhash,
final int domainzone,
final String host,
final boolean specialRights) {
final boolean specialRights,
final Segment indexSegment,
final RankingProfile ranking) {
this.queryString = queryString;
this.queryHashes = queryHashes;
this.excludeHashes = excludeHashes;
@ -171,6 +177,16 @@ public final class QueryParams {
this.remotepeer = null;
this.handle = Long.valueOf(System.currentTimeMillis());
this.specialRights = specialRights;
this.order = new ReferenceOrder(this.ranking, this.targetlang);
this.indexSegment = indexSegment;
}
public ReferenceOrder getOrder() {
return this.order;
}
public Segment getSegment() {
return this.indexSegment;
}
public int neededResults() {

@ -65,10 +65,8 @@ public final class RankingProcess extends Thread {
private static boolean useYBR = true;
private static final int maxDoubleDomAll = 20, maxDoubleDomSpecial = 10000;
private final Segment indexSegment;
private final QueryParams query;
private final int maxentries;
private final ReferenceOrder order;
private final ConcurrentHashMap<String, Integer> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private final int[] flagcount; // flag counter
private final TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB
@ -86,11 +84,7 @@ public final class RankingProcess extends Thread {
private final ConcurrentHashMap<String, AuthorInfo> authorNavigator;
public RankingProcess(
final Segment indexSegment,
final QueryParams query,
final int maxentries,
final int concurrency) {
public RankingProcess(final QueryParams query, final int maxentries, final int concurrency) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
// sortorder: 0 = hash, 1 = url, 2 = ranking
@ -98,7 +92,6 @@ public final class RankingProcess extends Thread {
this.stack = new SortStack<WordReferenceVars>(maxentries);
this.doubleDomCache = new HashMap<String, SortStack<WordReferenceVars>>();
this.handover = new HashSet<String>();
this.order = (query == null) ? null : new ReferenceOrder(query.ranking, query.targetlang);
this.query = query;
this.maxentries = maxentries;
this.remote_peerCount = 0;
@ -107,7 +100,6 @@ public final class RankingProcess extends Thread {
this.local_resourceSize = 0;
this.urlhashes = new ConcurrentHashMap<String, Integer>(0, 0.75f, concurrency);
this.misses = new TreeSet<String>();
this.indexSegment = indexSegment;
this.flagcount = new int[32];
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
this.hostNavigator = new ConcurrentHashMap<String, HostInfo>();
@ -119,6 +111,10 @@ public final class RankingProcess extends Thread {
assert this.feeders >= 1;
}
public QueryParams getQuery() {
return this.query;
}
public void run() {
// do a search
@ -126,7 +122,7 @@ public final class RankingProcess extends Thread {
// so following sortings together with the global results will be fast
try {
long timer = System.currentTimeMillis();
final TermSearch<WordReference> search = this.indexSegment.termIndex().query(
final TermSearch<WordReference> search = this.query.getSegment().termIndex().query(
query.queryHashes,
query.excludeHashes,
null,
@ -146,14 +142,6 @@ public final class RankingProcess extends Thread {
oneFeederTerminated();
}
public long ranking(final WordReferenceVars word) {
return order.cardinal(word);
}
public int[] zones() {
return this.domZones;
}
public void add(final ReferenceContainer<WordReference> index, final boolean local, final int fullResource) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
@ -170,7 +158,7 @@ public final class RankingProcess extends Thread {
long timer = System.currentTimeMillis();
// normalize entries
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
final BlockingQueue<WordReferenceVars> decodedEntries = this.query.getOrder().normalizeWith(index);
MemoryTracker.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false);
// iterate over normalized entries and select some that are better than currently stored
@ -244,7 +232,7 @@ public final class RankingProcess extends Thread {
for (WordReferenceVars fEntry: filteredEntries) {
// kick out entries that are too bad according to current findings
r = Long.valueOf(order.cardinal(fEntry));
r = Long.valueOf(this.query.getOrder().cardinal(fEntry));
assert maxentries != 0;
if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;
@ -367,6 +355,15 @@ public final class RankingProcess extends Thread {
return bestEntry;
}
/**
* get one metadata entry from the ranked results. This will be the 'best' entry so far
* according to the applied ranking. If there are no more entries left or the timeout
* limit is reached then null is returned. The caller can distinguish the timeout case
* from the case where no more entries will ever arrive by calling this.feedingIsFinished()
* @param skipDoubleDom should be true if double-domain entries shall be skipped
* @param timeout the time this method may take for a result computation
* @return a metadata entry for a url
*/
public URIMetadataRow takeURL(final boolean skipDoubleDom, final int timeout) {
// returns from the current RWI list the best URL entry and removes this entry from the list
long timeLimit = System.currentTimeMillis() + timeout;
@ -377,7 +374,7 @@ public final class RankingProcess extends Thread {
try {Thread.sleep(50);} catch (final InterruptedException e1) {}
continue;
}
final URIMetadataRow page = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
if (page == null) {
misses.add(obrwi.element.metadataHash());
continue;
@ -412,7 +409,7 @@ public final class RankingProcess extends Thread {
(query.constraint.get(Condenser.flag_cat_indexof)) &&
(!(pagetitle.startsWith("index of")))) {
final Iterator<byte[]> wi = query.queryHashes.iterator();
while (wi.hasNext()) try { indexSegment.termIndex().remove(wi.next(), page.hash()); } catch (IOException e) {}
while (wi.hasNext()) try { this.query.getSegment().termIndex().remove(wi.next(), page.hash()); } catch (IOException e) {}
continue;
}
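
The documented contract of takeURL suggests a consumer loop like this sketch (process(...) is a hypothetical callback; the RankingProcess ranked is assumed to be started already):

// Drain the ranked results: null means either "timed out" or "really exhausted";
// feedingIsFinished() separates the two cases, as the javadoc above states.
URIMetadataRow page;
while ((page = ranked.takeURL(true, 1000)) != null || !ranked.feedingIsFinished()) {
    if (page == null) continue; // timeout, feeders still running: retry
    process(page);              // hypothetical consumer of the metadata entry
}
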
@ -564,7 +561,7 @@ public final class RankingProcess extends Thread {
DigestURI url;
String hostname;
for (int i = 0; i < rc; i++) {
mr = indexSegment.urlMetadata().load(hsa[i].hashsample, null, 0);
mr = this.query.getSegment().urlMetadata().load(hsa[i].hashsample, null, 0);
if (mr == null) continue;
url = mr.metadata().url();
if (url == null) continue;
@ -655,10 +652,6 @@ public final class RankingProcess extends Thread {
return result;
}
public ReferenceOrder getOrder() {
return this.order;
}
public static void loadYBR(final File rankingPath, final int count) {
// load ranking tables
if (rankingPath.exists()) {

@ -54,7 +54,6 @@ public class ResultFetcher {
// input values
final RankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
QueryParams query;
private final Segment indexSegment;
private final yacySeedDB peers;
// result values
@ -71,13 +70,11 @@ public class ResultFetcher {
public ResultFetcher(
RankingProcess rankedCache,
final QueryParams query,
final Segment indexSegment,
final yacySeedDB peers,
final int taketimeout) {
this.rankedCache = rankedCache;
this.query = query;
this.indexSegment = indexSegment;
this.peers = peers;
this.taketimeout = taketimeout;
@ -121,7 +118,6 @@ public class ResultFetcher {
return false;
}
public long getURLRetrievalTime() {
return this.urlRetrievalAllTime;
}
@ -166,7 +162,7 @@ public class ResultFetcher {
if (page == null) break;
if (failedURLs.get(page.hash()) != null) continue;
final ResultEntry resultEntry = fetchSnippet(page, snippetMode);
final ResultEntry resultEntry = fetchSnippet(page, snippetMode); // does not fetch snippets if snippetMode == 0
if (resultEntry == null) continue; // the entry had some problems, cannot be used
if (result.exists(resultEntry)) continue;
@ -177,7 +173,7 @@ public class ResultFetcher {
// place the result to the result vector
// apply post-ranking
long ranking = Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word()));
long ranking = Long.valueOf(query.getOrder().cardinal(resultEntry.word()));
ranking += postRanking(resultEntry, rankedCache.getTopics());
//System.out.println("*** resultEntry.hash = " + resultEntry.hash());
result.push(resultEntry, ranking);
@ -209,7 +205,7 @@ public class ResultFetcher {
final long dbRetrievalTime = System.currentTimeMillis() - startTime;
if (snippetMode == 0) {
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, 0); // result without snippet
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, 0); // result without snippet
}
// load snippet
@ -222,17 +218,17 @@ public class ResultFetcher {
if (snippet.getErrorCode() < 11) {
// we loaded the file and found the snippet
return new ResultEntry(page, indexSegment, peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (snippetMode == 1) {
// we did not demand online loading, so a failed snippet fetch does not cause this result to be rejected
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
if (!peers.mySeed().isVirgin())
try {
TextSnippet.failConsequences(this.indexSegment, page.word(), snippet, query.id(false));
TextSnippet.failConsequences(query.getSegment(), page.word(), snippet, query.id(false));
} catch (IOException e) {
Log.logException(e);
}
@ -247,9 +243,9 @@ public class ResultFetcher {
if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
// found media snippets, return entry
return new ResultEntry(page, indexSegment, peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
return new ResultEntry(page, query.getSegment(), peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
} else if (snippetMode == 1) {
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime);
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no media snippet for URL " + metadata.url());

@ -64,7 +64,6 @@ public final class SearchEvent {
// class variables that may be implemented with an abstract class
private long eventTime;
private QueryParams query;
private final Segment indexSegment;
private final yacySeedDB peers;
private RankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
private ResultFetcher results;
@ -82,13 +81,11 @@ public final class SearchEvent {
private byte[] IAmaxcounthash, IAneardhthash;
@SuppressWarnings("unchecked") SearchEvent(final QueryParams query,
final Segment indexSegment,
final yacySeedDB peers,
final ResultURLs crawlResults,
final TreeMap<byte[], String> preselectedPeerHashes,
final boolean generateAbstracts) {
this.eventTime = System.currentTimeMillis(); // for lifetime check
this.indexSegment = indexSegment;
this.peers = peers;
this.crawlResults = crawlResults;
this.query = query;
@ -109,7 +106,7 @@ public final class SearchEvent {
// initialize a ranking process that is the target for data
// that is generated concurrently from local and global search threads
this.rankedCache = new RankingProcess(indexSegment, query, max_results_preparation, fetchpeers + 1);
this.rankedCache = new RankingProcess(query, max_results_preparation, fetchpeers + 1);
// start a local search concurrently
this.rankedCache.start();
@ -128,7 +125,7 @@ public final class SearchEvent {
query.authorhash == null ? "" : query.authorhash,
query.displayResults(),
query.maxDistance,
indexSegment,
query.getSegment(),
peers,
crawlResults,
rankedCache,
@ -149,10 +146,10 @@ public final class SearchEvent {
}
// start worker threads to fetch urls and snippets
this.results = new ResultFetcher(rankedCache, query, indexSegment, peers, 10000);
this.results = new ResultFetcher(rankedCache, query, peers, 10000);
} else {
// do a local search
this.rankedCache = new RankingProcess(indexSegment, query, max_results_preparation, 2);
this.rankedCache = new RankingProcess(query, max_results_preparation, 2);
this.rankedCache.run();
//CrawlSwitchboard.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process);
@ -184,7 +181,7 @@ public final class SearchEvent {
}
// start worker threads to fetch urls and snippets
this.results = new ResultFetcher(rankedCache, query, indexSegment, peers, 10);
this.results = new ResultFetcher(rankedCache, query, peers, 10);
}
// clean up events
@ -223,7 +220,7 @@ public final class SearchEvent {
final Iterator<byte[]> j = removeWords.iterator();
// remove the same url hashes for multiple words
while (j.hasNext()) {
this.indexSegment.termIndex().remove(j.next(), this.results.failedURLs.keySet());
this.query.getSegment().termIndex().remove(j.next(), this.results.failedURLs.keySet());
}
} catch (IOException e) {
Log.logException(e);
@ -376,7 +373,7 @@ public final class SearchEvent {
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls);
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words);
secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch(
words, "", urls, indexSegment, peers, crawlResults, this.rankedCache, peer, Switchboard.urlBlacklist,
words, "", urls, this.query.getSegment(), peers, crawlResults, this.rankedCache, peer, Switchboard.urlBlacklist,
query.ranking, query.constraint, preselectedPeerHashes);
}

@ -66,7 +66,6 @@ public class SearchEventCache {
public static SearchEvent getEvent(
final QueryParams query,
final Segment indexSegment,
final yacySeedDB peers,
final ResultURLs crawlResults,
final TreeMap<byte[], String> preselectedPeerHashes,
@ -90,7 +89,7 @@ public class SearchEventCache {
}
if (event == null) {
// start a new event
event = new SearchEvent(query, indexSegment, peers, crawlResults, preselectedPeerHashes, generateAbstracts);
event = new SearchEvent(query, peers, crawlResults, preselectedPeerHashes, generateAbstracts);
}
return event;
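
Call sites of the event cache drop the segment argument accordingly, matching the changes in yacysearch.java and search.java above; sketched:

// old: SearchEventCache.getEvent(theQuery, indexSegment, sb.peers, sb.crawlResults, null, false)
final SearchEvent theSearch = SearchEventCache.getEvent(
        theQuery, sb.peers, sb.crawlResults, null, false);
// the event reaches the index through theQuery.getSegment()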

@ -603,7 +603,7 @@ public final class Switchboard extends serverSwitch {
int indexerThreads = Math.max(1, WorkflowProcessor.useCPU / 2);
this.indexingStorageProcessor = new WorkflowProcessor<indexingQueueEntry>(
"storeDocumentIndex",
"This is the sequencing step of the indexing queue: no concurrency is wanted here, because the access of the indexer works better if it is not concurrent. Files are written as streams, councurrency would destroy IO performance. In this process the words are written to the RWI cache, which flushes if it is full.",
"This is the sequencing step of the indexing queue. Files are written as streams, too much councurrency would destroy IO performance. In this process the words are written to the RWI cache, which flushes if it is full.",
new String[]{"RWI/Cache/Collections"},
this, "storeDocumentIndex", WorkflowProcessor.useCPU + 40, null, indexerThreads);
this.indexingAnalysisProcessor = new WorkflowProcessor<indexingQueueEntry>(

@ -61,6 +61,7 @@ import net.yacy.kelondro.util.SetTools;
public final class Condenser {
// this is the page analysis class
final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
// category flags that show how the page can be distinguished in different interest groups
public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
@ -110,7 +111,7 @@ public final class Condenser {
) throws UnsupportedEncodingException {
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
this.wordminsize = 3;
this.wordminsize = 2;
this.wordcut = 2;
this.words = new HashMap<String, Word>();
this.RESULT_FLAGS = new Bitfield(4);
@ -408,39 +409,41 @@ public final class Condenser {
}
}
Map.Entry<String, Word> entry;
// we search for similar words and reorganize the corresponding sentences
// a word is similar if a shortened version is equal
final Iterator<Map.Entry<String, Word>> wi = words.entrySet().iterator(); // enumerates the keys in descending order
wordsearch: while (wi.hasNext()) {
entry = wi.next();
word = entry.getKey();
wordlen = word.length();
wsp = entry.getValue();
for (int i = wordcut; i > 0; i--) {
if (wordlen > i) {
k = word.substring(0, wordlen - i);
if (words.containsKey(k)) {
// we will delete the word 'word' and repoint the
// corresponding links
// in sentences that use this word
wsp1 = words.get(k);
final Iterator<Integer> it1 = wsp.phrases(); // we iterate over all sentences that refer to this word
while (it1.hasNext()) {
idx = it1.next().intValue(); // number of a sentence
s = (String[]) orderedSentences[idx];
for (int j = 2; j < s.length; j++) {
if (s[j].equals(intString(wsp.posInText, numlength)))
s[j] = intString(wsp1.posInText, numlength);
if (pseudostemming) {
Map.Entry<String, Word> entry;
// we search for similar words and reorganize the corresponding sentences
// a word is similar if a shortened version is equal
final Iterator<Map.Entry<String, Word>> wi = words.entrySet().iterator(); // enumerates the keys in descending order
wordsearch: while (wi.hasNext()) {
entry = wi.next();
word = entry.getKey();
wordlen = word.length();
wsp = entry.getValue();
for (int i = wordcut; i > 0; i--) {
if (wordlen > i) {
k = word.substring(0, wordlen - i);
if (words.containsKey(k)) {
// we will delete the word 'word' and repoint the
// corresponding links
// in sentences that use this word
wsp1 = words.get(k);
final Iterator<Integer> it1 = wsp.phrases(); // we iterate over all sentences that refer to this word
while (it1.hasNext()) {
idx = it1.next().intValue(); // number of a sentence
s = (String[]) orderedSentences[idx];
for (int j = 2; j < s.length; j++) {
if (s[j].equals(intString(wsp.posInText, numlength)))
s[j] = intString(wsp1.posInText, numlength);
}
orderedSentences[idx] = s;
}
orderedSentences[idx] = s;
// update word counter
wsp1.count = wsp1.count + wsp.count;
words.put(k, wsp1);
// remove current word
wi.remove();
continue wordsearch;
}
// update word counter
wsp1.count = wsp1.count + wsp.count;
words.put(k, wsp1);
// remove current word
wi.remove();
continue wordsearch;
}
}
}
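
For reference, the now-disabled pseudostemming treated a word as similar when a shortened form matched an existing entry; a tiny worked illustration of the prefix probe, assuming wordcut = 2 as set in the constructor:

// word = "houses", wordcut = 2: the loop probes "hous" (i=2), then "house" (i=1);
// if "house" already is a key in words, "houses" is merged into it and its
// sentence references are repointed, which is how plural/singular forms collapsed.
final String word = "houses";
final int wordcut = 2;
for (int i = wordcut; i > 0; i--) {
    if (word.length() > i) {
        final String k = word.substring(0, word.length() - i);
        System.out.println(k); // prints "hous", then "house"
    }
}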

@ -44,6 +44,10 @@ public class SortStack<E> {
private ConcurrentHashMap<E, Object> instack; // keeps track which element has been on the stack
protected int maxsize;
public SortStack() {
this(-1);
}
public SortStack(final int maxsize) {
// the maxsize is the maximum number of entries in the stack
// if this is set to -1, the size is unlimited
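
The added no-argument constructor simply delegates to the bounded one with -1; both forms sketched:

SortStack<String> unbounded = new SortStack<String>();   // same as new SortStack<String>(-1)
SortStack<String> top100 = new SortStack<String>(100);   // keeps at most 100 entries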

@ -28,6 +28,8 @@ package net.yacy.kelondro.workflow;
public interface BlockingThread<J extends WorkflowJob> extends WorkflowThread {
public void setManager(WorkflowProcessor<J> queue);
public WorkflowProcessor<J> getManager();
public J job(J next) throws Exception;

@ -55,6 +55,20 @@ public class InstantBlockingThread<J extends WorkflowJob> extends AbstractBlocki
this.handle = Long.valueOf(System.currentTimeMillis() + this.getName().hashCode());
}
public InstantBlockingThread(final Object env, final Method jobExecMethod, final WorkflowProcessor<J> manager) {
// jobExecMethod is a method of the object 'env' that executes the one-step-run
// jobCount is the name of a method that returns the size of the job
// set the manager of blocking queues for input and output
this.setManager(manager);
// define execution class
this.jobExecMethod = jobExecMethod;
this.environment = (env instanceof Class<?>) ? null : env;
this.setName(jobExecMethod.getClass().getName() + "." + jobExecMethod.getName() + "." + handleCounter++);
this.handle = Long.valueOf(System.currentTimeMillis() + this.getName().hashCode());
}
protected static Method execMethod(final Object env, final String jobExec) {
final Class<?> theClass = (env instanceof Class<?>) ? (Class<?>) env : env.getClass();
try {
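
The added constructor takes a pre-resolved java.lang.reflect.Method instead of a method name, so the reflective lookup through execMethod happens once per processor rather than once per thread. Package-internal usage might look like this sketch (env and manager come from the owning WorkflowProcessor; "storeDocumentIndex" is one of the job methods named in this commit):

// resolve the job method once ...
final Method job = InstantBlockingThread.execMethod(env, "storeDocumentIndex");
// ... then hand it to each worker thread without repeating the lookup
final InstantBlockingThread<WorkflowJob> worker =
        new InstantBlockingThread<WorkflowJob>(env, job, manager);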

@ -1,4 +1,4 @@
// serverProcessor.java
// WorkflowJob.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 29.02.2008 on http://yacy.net
//

@ -56,7 +56,8 @@ public class WorkflowProcessor<J extends WorkflowJob> {
public WorkflowProcessor(
String name, String description, String[] childnames,
final Object env, final String jobExecMethod, final int inputQueueSize, final WorkflowProcessor<J> output, final int poolsize) {
final Object env, final String jobExecMethod,
final int inputQueueSize, final WorkflowProcessor<J> output, final int poolsize) {
// start a fixed number of executors that handle entries in the process queue
this.environment = env;
this.processName = name;
@ -135,7 +136,7 @@ public class WorkflowProcessor<J extends WorkflowJob> {
Log.logWarning("PROCESSOR", "executing job " + environment.getClass().getName() + "." + methodName + " serialized");
try {
final J out = (J) InstantBlockingThread.execMethod(this.environment, this.methodName).invoke(environment, new Object[]{in});
if ((out != null) && (output != null)) output.enQueue(out);
if (out != null && this.output != null) this.output.enQueue(out);
} catch (final IllegalArgumentException e) {
Log.logException(e);
} catch (final IllegalAccessException e) {
