|
|
|
// plasmaSearchContainer.java
|
|
|
|
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
|
|
// first published 29.8.2007 on http://yacy.net
|
|
|
|
//
|
|
|
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
|
|
|
//
|
|
|
|
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
|
|
|
// $LastChangedRevision: 1986 $
|
|
|
|
// $LastChangedBy: orbiter $
|
|
|
|
//
|
|
|
|
// LICENSE
|
|
|
|
//
|
|
|
|
// This program is free software; you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU General Public License as published by
|
|
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
|
|
|
// This program is distributed in the hope that it will be useful,
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU General Public License
|
|
|
|
// along with this program; if not, write to the Free Software
|
|
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
|
|
|
|
|
|
|
|
package de.anomic.plasma;
|
|
|
|
|
|
|
|
import java.util.HashSet;
|
|
|
|
import java.util.Iterator;
|
|
|
|
import java.util.Set;
|
|
|
|
import java.util.TreeSet;
|
|
|
|
|
|
|
|
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
|
|
|
import de.anomic.index.indexContainer;
|
|
|
|
import de.anomic.index.indexRWIEntry;
|
|
|
|
import de.anomic.kelondro.kelondroMScoreCluster;
|
|
|
|
|
|
|
|
public class plasmaSearchContainer {
|
|
|
|
|
|
|
|
private indexRWIEntry entryMin, entryMax;
|
|
|
|
private indexContainer container;
|
|
|
|
private plasmaSearchRankingProfile ranking;
|
|
|
|
private TreeSet searchedWords;
|
|
|
|
private int globalcount;
|
|
|
|
private HashSet urlhashes; // set for double-check
|
|
|
|
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
|
|
|
|
private plasmaSearchQuery query;
|
|
|
|
|
|
|
|
|
|
|
|
public plasmaSearchContainer(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, TreeSet searchedWords) {
|
|
|
|
this(query, ranking, searchedWords, plasmaWordIndex.emptyContainer(null, 0));
|
|
|
|
}
|
|
|
|
|
|
|
|
public plasmaSearchContainer(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, TreeSet searchedWords, indexContainer presortedContainer) {
|
|
|
|
// only for sorted containers
|
|
|
|
this.entryMin = null;
|
|
|
|
this.entryMax = null;
|
|
|
|
this.container = presortedContainer;
|
|
|
|
this.ranking = ranking;
|
|
|
|
this.searchedWords = searchedWords;
|
|
|
|
this.globalcount = 0;
|
|
|
|
this.urlhashes = new HashSet();
|
|
|
|
this.ref = new kelondroMScoreCluster();
|
|
|
|
this.query = query;
|
|
|
|
}
|
|
|
|
|
|
|
|
public void insert(indexRWIEntry entry, boolean local) {
|
|
|
|
// add the entry to the container into a position in such a way, that the container stays sorted
|
|
|
|
assert (entry != null);
|
|
|
|
|
|
|
|
// make a double-check: because different peers may have computed different ranking attributes,
|
|
|
|
// the double check cannot be made using the ranking and the insert position
|
|
|
|
if (urlhashes.contains(entry.urlHash())) return;
|
|
|
|
urlhashes.add(entry.urlHash());
|
|
|
|
|
|
|
|
// find new min/max borders
|
|
|
|
if (this.entryMin == null) this.entryMin = (indexRWIEntry) entry.clone(); else this.entryMin.min(entry);
|
|
|
|
if (this.entryMax == null) this.entryMax = (indexRWIEntry) entry.clone(); else this.entryMax.max(entry);
|
|
|
|
long pivot = this.ranking.preRanking(entry, this.entryMin, this.entryMax, this.searchedWords);
|
|
|
|
|
|
|
|
// insert the entry
|
|
|
|
int insertPosition = insertPosition(pivot);
|
|
|
|
|
|
|
|
// insert at found position
|
|
|
|
container.insertUnique(insertPosition, entry.toKelondroEntry());
|
|
|
|
|
|
|
|
// update counter
|
|
|
|
if (!local) this.globalcount++;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
public void insert(indexContainer c, boolean local, boolean presorted) {
|
|
|
|
if ((this.container.size() == 0) && (presorted)) {
|
|
|
|
this.container = c;
|
|
|
|
if (!local) this.globalcount = c.size();
|
|
|
|
} else {
|
|
|
|
Iterator i = c.entries();
|
|
|
|
while (i.hasNext()) {
|
|
|
|
insert((indexRWIEntry) i.next(), local);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private int insertPosition(long pivotRanking) {
|
|
|
|
return insertPosition(pivotRanking, 0, container.size());
|
|
|
|
}
|
|
|
|
|
|
|
|
private int insertPosition(long pivotRanking, int left /*including*/, int right /*excluding*/) {
|
|
|
|
if (right - left < 10) {
|
|
|
|
// do iterative search, less overhead
|
|
|
|
for (int i = left; i < right; i++) {
|
|
|
|
if (this.ranking.preRanking(new indexRWIEntry(container.get(i)), this.entryMin, this.entryMax, this.searchedWords) < pivotRanking) {
|
|
|
|
// we found the right insert position
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return right;
|
|
|
|
}
|
|
|
|
// find recursively
|
|
|
|
int middle = (left + right) / 2;
|
|
|
|
if (this.ranking.preRanking(new indexRWIEntry(container.get(middle)), this.entryMin, this.entryMax, this.searchedWords) < pivotRanking) {
|
|
|
|
// must be on the left side
|
|
|
|
return insertPosition(pivotRanking, left, middle);
|
|
|
|
} else {
|
|
|
|
// must be on the right side
|
|
|
|
return insertPosition(pivotRanking, middle + 1, right);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
public indexRWIEntry remove(String urlHash) {
|
|
|
|
return this.container.remove(urlHash);
|
|
|
|
}
|
|
|
|
|
|
|
|
public int removeEntries(Set urlHashes) {
|
|
|
|
return this.container.removeEntries(urlHashes);
|
|
|
|
}
|
|
|
|
|
|
|
|
public indexContainer container() {
|
|
|
|
return this.container;
|
|
|
|
}
|
|
|
|
|
|
|
|
public int getGlobalCount() {
|
|
|
|
return this.globalcount;
|
|
|
|
}
|
|
|
|
|
|
|
|
public Set getReferences(int count) {
|
|
|
|
// create a list of words that had been computed by statistics over all
|
|
|
|
// words that appeared in the url or the description of all urls
|
|
|
|
Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE);
|
|
|
|
TreeSet s = new TreeSet(String.CASE_INSENSITIVE_ORDER);
|
|
|
|
for (int i = 0; i < refs.length; i++) {
|
|
|
|
s.add((String) refs[i]);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
public void addReferences(String[] words) {
|
|
|
|
String word;
|
|
|
|
for (int i = 0; i < words.length; i++) {
|
|
|
|
word = words[i].toLowerCase();
|
|
|
|
if ((word.length() > 2) &&
|
|
|
|
("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
|
|
|
|
(!(query.queryHashes.contains(plasmaCondenser.word2hash(word)))))
|
|
|
|
ref.incScore(word);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
protected void addReferences(plasmaSearchEvent.ResultEntry resultEntry) {
|
|
|
|
// take out relevant information for reference computation
|
|
|
|
if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
|
|
|
|
String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
|
|
|
|
String[] descrcomps = resultEntry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
|
|
|
|
|
|
|
|
// add references
|
|
|
|
addReferences(urlcomps);
|
|
|
|
addReferences(descrcomps);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|