yacy_search_server/source/de/anomic/plasma/plasmaSearchContainer.java

// plasmaSearchContainer.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 29.8.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
// 
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


package de.anomic.plasma;

import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.kelondro.kelondroMScoreCluster;

/**
 * Holds the index entries of one search in ranking order and gathers the
 * statistics that belong to that result set.
 *
 * Entries are kept so that an entry with a higher pre-ranking value precedes
 * entries with a lower one (see {@link #insert(indexRWIEntry, boolean)}).
 * The class also counts the entries that did not come from the local peer and
 * collects word statistics ("references") from result urls and titles.
 *
 * This class performs no synchronization of its own.
 */
public class plasmaSearchContainer {

    // Generic url/markup words that are never counted as references.
    // They are looked up as whole words. "htm" is listed explicitly because the
    // former substring test against this list filtered it as a part of "html".
    private static final Set referenceStopwords = new HashSet();
    static {
        final String[] stopwords = {
            "http", "html", "htm", "php", "ftp", "www", "com", "org", "net", "gov", "edu",
            "index", "home", "page", "for", "usage", "the", "and"
        };
        for (int i = 0; i < stopwords.length; i++) {
            referenceStopwords.add(stopwords[i]);
        }
    }

    // running borders, updated through indexRWIEntry.min()/max() on every
    // single-entry insert and handed to the pre-ranking computation
    private indexRWIEntry entryMin, entryMax;
    private indexContainer container;
    private plasmaSearchRankingProfile ranking;
    private TreeSet searchedWords;
    private int globalcount; // number of entries that were inserted with local == false
    private HashSet urlhashes; // set for double-check
    private kelondroMScoreCluster ref;  // reference score computation for the commonSense heuristic
    private plasmaSearchQuery query;


    /**
     * Creates a container that starts with the container returned by
     * plasmaWordIndex.emptyContainer(null, 0).
     *
     * @param query         the query; its queryHashes are excluded from the references
     * @param ranking       the profile that computes the pre-ranking of entries
     * @param searchedWords passed through to the pre-ranking computation
     */
    public plasmaSearchContainer(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, TreeSet searchedWords) {
        this(query, ranking, searchedWords, plasmaWordIndex.emptyContainer(null, 0));
    }

    /**
     * Creates a container on top of an existing container. The given container
     * is used directly (it is not copied) and must already be sorted.
     *
     * @param query              the query; its queryHashes are excluded from the references
     * @param ranking            the profile that computes the pre-ranking of entries
     * @param searchedWords      passed through to the pre-ranking computation
     * @param presortedContainer the initial, already sorted content
     */
    public plasmaSearchContainer(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, TreeSet searchedWords, indexContainer presortedContainer) {
        // only for sorted containers
        this.entryMin = null;
        this.entryMax = null;
        this.container = presortedContainer;
        this.ranking = ranking;
        this.searchedWords = searchedWords;
        this.globalcount = 0;
        this.urlhashes = new HashSet();
        this.ref = new kelondroMScoreCluster();
        this.query = query;

        // the initial content must take part in the double-check as well
        registerUrlHashes(presortedContainer);
    }

    /**
     * Adds one entry at the position that keeps the container sorted.
     * An entry whose url hash was seen before is ignored.
     *
     * @param entry the entry to add; must not be null
     * @param local false if the entry shall be counted by {@link #getGlobalCount()}
     */
    public void insert(indexRWIEntry entry, boolean local) {
        assert (entry != null);

        // make a double-check: because different peers may have computed different ranking attributes,
        // the double check cannot be made using the ranking and the insert position.
        // HashSet.add reports false if the hash was already present.
        if (!urlhashes.add(entry.urlHash())) return;

        // find new min/max borders
        if (this.entryMin == null) this.entryMin = (indexRWIEntry) entry.clone(); else this.entryMin.min(entry);
        if (this.entryMax == null) this.entryMax = (indexRWIEntry) entry.clone(); else this.entryMax.max(entry);
        long pivot = this.ranking.preRanking(entry, this.entryMin, this.entryMax, this.searchedWords);

        // insert at the position that the ranking of the entry demands
        container.insertUnique(insertPosition(pivot), entry.toKelondroEntry());

        // update counter
        if (!local) this.globalcount++;
    }

    /**
     * Adds all entries of a container.
     *
     * If this container is still empty and the given one is presorted, the
     * given container is adopted as it is (not copied). Otherwise every entry
     * is inserted individually through {@link #insert(indexRWIEntry, boolean)}.
     *
     * @param c         the entries to add; null is ignored
     * @param local     false if the entries shall be counted by {@link #getGlobalCount()}
     * @param presorted true if c is already in ranking order
     */
    public void insert(indexContainer c, boolean local, boolean presorted) {
        if (c == null) return;

        if ((this.container.size() == 0) && (presorted)) {
            this.container = c;
            // without this the double-check in insert(entry, local) would not know
            // the adopted entries and would let the same url in a second time
            registerUrlHashes(c);
            // NOTE(review): entryMin/entryMax are not widened by adopted entries;
            // confirm whether the pre-ranking borders should cover them as well
            if (!local) this.globalcount = c.size();
            return;
        }

        Iterator i = c.entries();
        while (i.hasNext()) {
            insert((indexRWIEntry) i.next(), local);
        }
    }

    /**
     * Records the url hashes of all entries of a container in the double-check set.
     * Null and empty containers are skipped without iterating them.
     */
    private void registerUrlHashes(indexContainer c) {
        if ((c == null) || (c.size() == 0)) return;
        Iterator i = c.entries();
        while (i.hasNext()) {
            this.urlhashes.add(((indexRWIEntry) i.next()).urlHash());
        }
    }

    /**
     * Computes the pre-ranking of the entry stored at the given container
     * position, using the current min/max borders.
     */
    private long rankingAt(int index) {
        return this.ranking.preRanking(new indexRWIEntry(container.get(index)), this.entryMin, this.entryMax, this.searchedWords);
    }

    /**
     * Finds the insert position for a ranking value over the whole container.
     */
    private int insertPosition(long pivotRanking) {
        return insertPosition(pivotRanking, 0, container.size());
    }

    /**
     * Finds the insert position for a ranking value within a window of the container.
     *
     * @param pivotRanking the ranking of the entry that shall be inserted
     * @param left         first position of the window (including)
     * @param right        end of the window (excluding)
     * @return the first position in the window whose entry ranks lower than
     *         pivotRanking, or the end of the remaining window if there is none
     */
    private int insertPosition(long pivotRanking, int left /*including*/, int right /*excluding*/) {
        // halve the window as long as it is large
        while (right - left >= 10) {
            // written this way the midpoint cannot overflow for large positions
            int middle = left + (right - left) / 2;
            if (rankingAt(middle) < pivotRanking) {
                // must be on the left side; middle itself stays a candidate as the window end
                right = middle;
            } else {
                // must be on the right side
                left = middle + 1;
            }
        }

        // small window: do iterative search, less overhead
        for (int i = left; i < right; i++) {
            if (rankingAt(i) < pivotRanking) {
                // we found the right insert position
                return i;
            }
        }
        return right;
    }

    /**
     * Removes the entry with the given url hash from the underlying container.
     * The double-check set and the global counter are left untouched.
     *
     * @return whatever the underlying container returns for the removal
     */
    public indexRWIEntry remove(String urlHash) {
        return this.container.remove(urlHash);
    }

    /**
     * Removes the entries with the given url hashes from the underlying container.
     * The double-check set and the global counter are left untouched.
     *
     * @return whatever the underlying container returns for the removal
     */
    public int removeEntries(Set urlHashes) {
        return this.container.removeEntries(urlHashes);
    }

    /**
     * @return the underlying container itself, not a copy
     */
    public indexContainer container() {
        return this.container;
    }

    /**
     * @return the counter of entries that were added with local == false
     */
    public int getGlobalCount() {
        return this.globalcount;
    }

    /**
     * Creates a list of words that had been computed by statistics over all
     * words that appeared in the url or the description of all urls.
     *
     * @param count handed to kelondroMScoreCluster.getScores together with the
     *              fixed arguments false, 2 and Integer.MAX_VALUE
     * @return the selected words in a set that is ordered case-insensitively
     */
    public Set getReferences(int count) {
        Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE);
        TreeSet s = new TreeSet(String.CASE_INSENSITIVE_ORDER);
        for (int i = 0; i < refs.length; i++) {
            s.add((String) refs[i]);
        }
        return s;
    }

    /**
     * Counts the given words for the reference statistics.
     *
     * A word is counted after lower-casing if it has more than two characters,
     * is not one of the generic stop words and is not one of the query words.
     *
     * @param words the candidate words; null and null elements are ignored
     */
    public void addReferences(String[] words) {
        if (words == null) return;
        String word;
        for (int i = 0; i < words.length; i++) {
            if (words[i] == null) continue;
            word = words[i].toLowerCase();
            if (word.length() <= 2) continue;
            // whole-word lookup: a substring test against the joined list would also
            // reject fragments of the stop words such as "age" (from "usage" or "page")
            if (referenceStopwords.contains(word)) continue;
            if (query.queryHashes.contains(plasmaCondenser.word2hash(word))) continue;
            ref.incScore(word);
        }
    }

    /**
     * Counts the words of the url and of the title of a result entry for the
     * reference statistics. Entries without url or without title are ignored.
     *
     * @param resultEntry the result to take the words from
     */
    protected void addReferences(plasmaSearchEvent.ResultEntry resultEntry) {
        // take out relevant information for reference computation
        if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
        String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
        String[] descrcomps = resultEntry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description

        // add references
        addReferences(urlcomps);
        addReferences(descrcomps);
    }

}
first version of next-generation search interface: - snippets are not fetched by browser using ajax, they are now fetched internally - YaCy-internal threads control existence of snippets and sort out bad results - search results are prepared using SSI includes - the search result page is visible right after the search request, the results drop in when they are detected - no more time-out strategy during search processes, results are shifted within queues when they arrive from remote peers - added result page switching! after the first 10 results, the next page can be retrieved - number of remote results is updated online on the result page as they drop in - removed old snippet servlet (which had been also a security leak btw) - media search is broken now, will be redesigned and fixed in another step git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4071 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago			`// plasmaSearchContainer.java`
			`// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany`
			`// first published 29.8.2007 on http://yacy.net`
			`//`
			`// This is a part of YaCy, a peer-to-peer based web search engine`
			`//`
			`// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $`
			`// $LastChangedRevision: 1986 $`
			`// $LastChangedBy: orbiter $`
			`//`
			`// LICENSE`
			`//`
			`// This program is free software; you can redistribute it and/or modify`
			`// it under the terms of the GNU General Public License as published by`
			`// the Free Software Foundation; either version 2 of the License, or`
			`// (at your option) any later version.`
			`//`
			`// This program is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`// GNU General Public License for more details.`
			`//`
			`// You should have received a copy of the GNU General Public License`
			`// along with this program; if not, write to the Free Software`
			`// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA`


			`package de.anomic.plasma;`

			`import java.util.HashSet;`
			`import java.util.Iterator;`
			`import java.util.Set;`
			`import java.util.TreeSet;`

			`import de.anomic.htmlFilter.htmlFilterContentScraper;`
			`import de.anomic.index.indexContainer;`
			`import de.anomic.index.indexRWIEntry;`
			`import de.anomic.kelondro.kelondroMScoreCluster;`

			`public class plasmaSearchContainer {`

			`private indexRWIEntry entryMin, entryMax;`
			`private indexContainer container;`
			`private plasmaSearchRankingProfile ranking;`
			`private TreeSet searchedWords;`
			`private int globalcount;`
			`private HashSet urlhashes; // set for double-check`
			`private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic`
			`private plasmaSearchQuery query;`


			`public plasmaSearchContainer(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, TreeSet searchedWords) {`
			`this(query, ranking, searchedWords, plasmaWordIndex.emptyContainer(null, 0));`
			`}`

			`public plasmaSearchContainer(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, TreeSet searchedWords, indexContainer presortedContainer) {`
			`// only for sorted containers`
			`this.entryMin = null;`
			`this.entryMax = null;`
			`this.container = presortedContainer;`
			`this.ranking = ranking;`
			`this.searchedWords = searchedWords;`
			`this.globalcount = 0;`
			`this.urlhashes = new HashSet();`
			`this.ref = new kelondroMScoreCluster();`
			`this.query = query;`
			`}`

			`public void insert(indexRWIEntry entry, boolean local) {`
			`// add the entry to the container into a position in such a way, that the container stays sorted`
			`assert (entry != null);`

			`// make a double-check: because different peers may have computed different ranking attributes,`
			`// the double check cannot be made using the ranking and the insert position`
			`if (urlhashes.contains(entry.urlHash())) return;`
			`urlhashes.add(entry.urlHash());`

			`// find new min/max borders`
			`if (this.entryMin == null) this.entryMin = (indexRWIEntry) entry.clone(); else this.entryMin.min(entry);`
			`if (this.entryMax == null) this.entryMax = (indexRWIEntry) entry.clone(); else this.entryMax.max(entry);`
			`long pivot = this.ranking.preRanking(entry, this.entryMin, this.entryMax, this.searchedWords);`

			`// insert the entry`
			`int insertPosition = insertPosition(pivot);`

			`// insert at found position`
			`container.insertUnique(insertPosition, entry.toKelondroEntry());`

			`// update counter`
			`if (!local) this.globalcount++;`

			`}`

			`public void insert(indexContainer c, boolean local, boolean presorted) {`
			`if ((this.container.size() == 0) && (presorted)) {`
			`this.container = c;`
			`if (!local) this.globalcount = c.size();`
			`} else {`
			`Iterator i = c.entries();`
			`while (i.hasNext()) {`
			`insert((indexRWIEntry) i.next(), local);`
			`}`
			`}`
			`}`

			`private int insertPosition(long pivotRanking) {`
			`return insertPosition(pivotRanking, 0, container.size());`
			`}`

			`private int insertPosition(long pivotRanking, int left /*including*/, int right /*excluding*/) {`
			`if (right - left < 10) {`
			`// do iterative search, less overhead`
			`for (int i = left; i < right; i++) {`
			`if (this.ranking.preRanking(new indexRWIEntry(container.get(i)), this.entryMin, this.entryMax, this.searchedWords) < pivotRanking) {`
			`// we found the right insert position`
			`return i;`
			`}`
			`}`
			`return right;`
			`}`
			`// find recursively`
			`int middle = (left + right) / 2;`
			`if (this.ranking.preRanking(new indexRWIEntry(container.get(middle)), this.entryMin, this.entryMax, this.searchedWords) < pivotRanking) {`
			`// must be on the left side`
			`return insertPosition(pivotRanking, left, middle);`
			`} else {`
			`// must be on the right side`
			`return insertPosition(pivotRanking, middle + 1, right);`
			`}`
			`}`

			`public indexRWIEntry remove(String urlHash) {`
			`return this.container.remove(urlHash);`
			`}`

			`public int removeEntries(Set urlHashes) {`
			`return this.container.removeEntries(urlHashes);`
			`}`

			`public indexContainer container() {`
			`return this.container;`
			`}`

			`public int getGlobalCount() {`
			`return this.globalcount;`
			`}`

re-implemented post-ranking of search results (should enhance search result quality) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4080 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago			`public Set getReferences(int count) {`
first version of next-generation search interface: - snippets are not fetched by browser using ajax, they are now fetched internally - YaCy-internal threads control existence of snippets and sort out bad results - search results are prepared using SSI includes - the search result page is visible right after the search request, the results drop in when they are detected - no more time-out strategy during search processes, results are shifted within queues when they arrive from remote peers - added result page switching! after the first 10 results, the next page can be retrieved - number of remote results is updated online on the result page as they drop in - removed old snippet servlet (which had been also a security leak btw) - media search is broken now, will be redesigned and fixed in another step git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4071 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago			`// create a list of words that had been computed by statistics over all`
			`// words that appeared in the url or the description of all urls`
re-implemented post-ranking of search results (should enhance search result quality) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4080 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago			`Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE);`
			`TreeSet s = new TreeSet(String.CASE_INSENSITIVE_ORDER);`
			`for (int i = 0; i < refs.length; i++) {`
			`s.add((String) refs[i]);`
			`}`
			`return s;`
first version of next-generation search interface: - snippets are not fetched by browser using ajax, they are now fetched internally - YaCy-internal threads control existence of snippets and sort out bad results - search results are prepared using SSI includes - the search result page is visible right after the search request, the results drop in when they are detected - no more time-out strategy during search processes, results are shifted within queues when they arrive from remote peers - added result page switching! after the first 10 results, the next page can be retrieved - number of remote results is updated online on the result page as they drop in - removed old snippet servlet (which had been also a security leak btw) - media search is broken now, will be redesigned and fixed in another step git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4071 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago			`}`

			`public void addReferences(String[] words) {`
			`String word;`
			`for (int i = 0; i < words.length; i++) {`
			`word = words[i].toLowerCase();`
			`if ((word.length() > 2) &&`
			`("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&`
			`(!(query.queryHashes.contains(plasmaCondenser.word2hash(word)))))`
			`ref.incScore(word);`
			`}`
			`}`

			`protected void addReferences(plasmaSearchEvent.ResultEntry resultEntry) {`
			`// take out relevant information for reference computation`
			`if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;`
			`String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url`
			`String[] descrcomps = resultEntry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description`

			`// add references`
			`addReferences(urlcomps);`
			`addReferences(descrcomps);`
			`}`

			`}`