// plasmaSearchPreOrder.java
// -----------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// Created: 23.10.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma ;
import java.io.File ;
import java.io.IOException ;
import java.util.HashSet ;
import java.util.Iterator ;
import java.util.Map ;
import java.util.TreeMap ;
import java.util.TreeSet ;
import de.anomic.index.indexContainer ;
import de.anomic.index.indexRWIEntry ;
import de.anomic.plasma.plasmaURL ;
import de.anomic.kelondro.kelondroBinSearch ;
import de.anomic.server.serverCodings ;
import de.anomic.server.serverFileUtils ;
public final class plasmaSearchPreOrder {
public static kelondroBinSearch [ ] ybrTables = null ; // block-rank tables
private static boolean useYBR = true ;
private indexRWIEntry entryMin , entryMax ;
private TreeMap pageAcc ; // key = order hash; value = plasmaLURL.entry
private plasmaSearchQuery query ;
private plasmaSearchRankingProfile ranking ;
private int filteredCount ;
public plasmaSearchPreOrder ( ) {
this . entryMin = null ;
this . entryMax = null ;
this . pageAcc = new TreeMap ( ) ;
this . query = null ;
this . ranking = null ;
}
public plasmaSearchPreOrder ( plasmaSearchQuery query , plasmaSearchRankingProfile ranking , indexContainer container , long maxTime ) {
this . query = query ;
this . ranking = ranking ;
long limitTime = ( maxTime < 0 ) ? Long . MAX_VALUE : System . currentTimeMillis ( ) + maxTime ;
indexRWIEntry iEntry ;
// first pass: find min/max to obtain limits for normalization
Iterator i = container . entries ( ) ;
int count = 0 ;
this . entryMin = null ;
this . entryMax = null ;
while ( i . hasNext ( ) ) {
if ( System . currentTimeMillis ( ) > limitTime ) break ;
iEntry = ( indexRWIEntry ) i . next ( ) ;
if ( this . entryMin = = null ) this . entryMin = ( indexRWIEntry ) iEntry . clone ( ) ; else this . entryMin . min ( iEntry ) ;
if ( this . entryMax = = null ) this . entryMax = ( indexRWIEntry ) iEntry . clone ( ) ; else this . entryMax . max ( iEntry ) ;
count + + ;
}
// second pass: normalize entries and get ranking
i = container . entries ( ) ;
this . pageAcc = new TreeMap ( ) ;
TreeSet searchWords = plasmaSearchQuery . cleanQuery ( query . queryString ) [ 0 ] ;
for ( int j = 0 ; j < count ; j + + ) {
iEntry = ( indexRWIEntry ) i . next ( ) ;
if ( iEntry . urlHash ( ) . length ( ) ! = container . row ( ) . width ( container . row ( ) . primaryKey ( ) ) ) continue ;
if ( ( ! ( query . constraint . equals ( plasmaSearchQuery . catchall_constraint ) ) ) & & ( ! ( iEntry . flags ( ) . allOf ( query . constraint ) ) ) ) continue ; // filter out entries that do not match the search constraint
if ( query . contentdom ! = plasmaSearchQuery . CONTENTDOM_TEXT ) {
if ( ( query . contentdom = = plasmaSearchQuery . CONTENTDOM_AUDIO ) & & ( ! ( iEntry . flags ( ) . get ( plasmaCondenser . flag_cat_hasaudio ) ) ) ) continue ;
if ( ( query . contentdom = = plasmaSearchQuery . CONTENTDOM_VIDEO ) & & ( ! ( iEntry . flags ( ) . get ( plasmaCondenser . flag_cat_hasvideo ) ) ) ) continue ;
if ( ( query . contentdom = = plasmaSearchQuery . CONTENTDOM_IMAGE ) & & ( ! ( iEntry . flags ( ) . get ( plasmaCondenser . flag_cat_hasimage ) ) ) ) continue ;
if ( ( query . contentdom = = plasmaSearchQuery . CONTENTDOM_APP ) & & ( ! ( iEntry . flags ( ) . get ( plasmaCondenser . flag_cat_hasapp ) ) ) ) continue ;
}
pageAcc . put ( serverCodings . encodeHex ( Long . MAX_VALUE - this . ranking . preRanking ( iEntry . generateNormalized ( this . entryMin , this . entryMax ) , searchWords ) , 16 ) + iEntry . urlHash ( ) , iEntry ) ;
}
this . filteredCount = pageAcc . size ( ) ;
}
public int filteredCount ( ) {
return this . filteredCount ;
}
public void remove ( boolean rootDomExt , boolean doubleDom ) {
// this removes all refererences to urls that are extended paths of existing 'RootDom'-urls
if ( pageAcc . size ( ) < = query . wantedResults ) return ;
HashSet rootDoms = new HashSet ( ) ;
HashSet doubleDoms = new HashSet ( ) ;
Iterator i = pageAcc . entrySet ( ) . iterator ( ) ;
Map . Entry entry ;
indexRWIEntry iEntry ;
String hashpart ;
boolean isWordRootURL ;
TreeSet querywords = plasmaSearchQuery . cleanQuery ( query . queryString ( ) ) [ 0 ] ;
while ( i . hasNext ( ) ) {
if ( pageAcc . size ( ) < = query . wantedResults ) break ;
entry = ( Map . Entry ) i . next ( ) ;
iEntry = ( indexRWIEntry ) entry . getValue ( ) ;
hashpart = iEntry . urlHash ( ) . substring ( 6 ) ;
isWordRootURL = plasmaURL . isWordRootURL ( iEntry . urlHash ( ) , querywords ) ;
if ( isWordRootURL ) {
rootDoms . add ( hashpart ) ;
} else {
if ( ( ( rootDomExt ) & & ( rootDoms . contains ( hashpart ) ) ) | |
( ( doubleDom ) & & ( doubleDoms . contains ( hashpart ) ) ) ) {
i . remove ( ) ;
}
}
doubleDoms . add ( hashpart ) ;
}
}
public static void loadYBR ( File rankingPath , int count ) {
// load ranking tables
if ( rankingPath . exists ( ) ) {
ybrTables = new kelondroBinSearch [ count ] ;
String ybrName ;
File f ;
try {
for ( int i = 0 ; i < count ; i + + ) {
ybrName = "YBR-4-" + serverCodings . encodeHex ( i , 2 ) + ".idx" ;
f = new File ( rankingPath , ybrName ) ;
if ( f . exists ( ) ) {
ybrTables [ i ] = new kelondroBinSearch ( serverFileUtils . read ( f ) , 6 ) ;
} else {
ybrTables [ i ] = null ;
}
}
} catch ( IOException e ) {
ybrTables = null ;
}
} else {
ybrTables = null ;
}
}
public static boolean canUseYBR ( ) {
return ybrTables ! = null ;
}
public static boolean isUsingYBR ( ) {
return useYBR ;
}
public static void switchYBR ( boolean usage ) {
useYBR = usage ;
}
public plasmaSearchPreOrder cloneSmart ( ) {
// clones only the top structure
plasmaSearchPreOrder theClone = new plasmaSearchPreOrder ( ) ;
theClone . query = this . query ;
theClone . ranking = this . ranking ;
theClone . pageAcc = ( TreeMap ) this . pageAcc . clone ( ) ;
return theClone ;
}
public boolean hasNext ( ) {
return pageAcc . size ( ) > 0 ;
}
public Object [ ] /*{indexEntry, Long}*/ next ( ) {
String top = ( String ) pageAcc . firstKey ( ) ;
//System.out.println("preorder-key: " + top);
Long preranking ;
try {
preranking = new Long ( Long . MAX_VALUE - Long . parseLong ( top . substring ( 0 , 16 ) , 16 ) ) ; // java.lang.NumberFormatException: For input string: "8000000000020b17" ???
} catch ( NumberFormatException e ) {
e . printStackTrace ( ) ;
preranking = new Long ( 0 ) ;
}
return new Object [ ] { ( indexRWIEntry ) pageAcc . remove ( top ) , preranking } ;
}
public indexRWIEntry [ ] getNormalizer ( ) {
return new indexRWIEntry [ ] { entryMin , entryMax } ;
}
public static int ybr_p ( String urlHash ) {
return 16 * ( 16 - ybr ( urlHash ) ) ;
}
public static int ybr ( String urlHash ) {
if ( ybrTables = = null ) return 16 ;
if ( ! ( useYBR ) ) return 16 ;
final String domHash = urlHash . substring ( 6 ) ;
for ( int i = 0 ; i < ybrTables . length ; i + + ) {
if ( ( ybrTables [ i ] ! = null ) & & ( ybrTables [ i ] . contains ( domHash . getBytes ( ) ) ) ) {
//System.out.println("YBR FOUND: " + urlHash + " (" + i + ")");
return i ;
}
}
//System.out.println("NOT FOUND: " + urlHash);
return 16 ;
}
}