@@ -65,8 +65,10 @@ import net.yacy.search.ranking.ReferenceOrder;
import net.yacy.search.snippet.ContentDomain;
import net.yacy.search.snippet.ResultEntry;

public final class RWIProcess extends Thread
{

    private static final long maxWaitPerResult = 30;
    private static final int maxDoubleDomAll = 1000, maxDoubleDomSpecial = 10000;

    private final QueryParams query;
@@ -79,6 +81,8 @@ public final class RWIProcess extends Thread {
    private int remote_resourceSize, remote_indexCount, remote_peerCount;
    private int local_indexCount;
    private int initialExpectedRemoteReferences;
    private final AtomicInteger expectedRemoteReferences, receivedRemoteReferences;
    private final WeakPriorityBlockingQueue<WordReferenceVars> stack;
    private final AtomicInteger feeders;
    private final ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
@@ -97,7 +101,6 @@ public final class RWIProcess extends Thread {
    private final ScoreMap<String> protocolNavigator; // a counter for protocol types
    private final ScoreMap<String> filetypeNavigator; // a counter for file types

    public RWIProcess(final QueryParams query, final ReferenceOrder order, final int maxentries) {
        // we collect the urlhashes and construct a list with urlEntry objects
        // attention: if minEntries is too high, this method will not terminate within the maxTime
@@ -112,11 +115,15 @@ public final class RWIProcess extends Thread {
        this.remote_resourceSize = 0;
        this.remote_indexCount = 0;
        this.local_indexCount = 0;
        this.urlhashes =
            new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
        this.misses =
            new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
        this.sortout = 0;
        this.flagcount = new int[32];
        for ( int i = 0; i < 32; i++ ) {
            this.flagcount[i] = 0;
        }
        this.hostNavigator = new ConcurrentScoreMap<String>();
        this.hostResolver = new ConcurrentHashMap<String, byte[]>();
        this.authorNavigator = new ConcurrentScoreMap<String>();
@@ -126,6 +133,18 @@ public final class RWIProcess extends Thread {
        this.ref = new ConcurrentScoreMap<String>();
        this.feeders = new AtomicInteger(1);
        this.startTime = System.currentTimeMillis();
        this.initialExpectedRemoteReferences = 0;
        this.expectedRemoteReferences = new AtomicInteger(0);
        this.receivedRemoteReferences = new AtomicInteger(0);
    }
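
    // The counters below track how many remote references are expected for this search and how many
    // have actually been received (add() increments the received counter); takeRWI() compares
    // initialExpectedRemoteReferences with receivedRemoteReferences to scale its per-result wait.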
    public void setExpectedRemoteReferences(int expectedRemoteReferences) {
        this.initialExpectedRemoteReferences = expectedRemoteReferences;
        this.expectedRemoteReferences.set(expectedRemoteReferences);
    }

    public void decExpectedRemoteReferences(int x) {
        this.expectedRemoteReferences.addAndGet(-x);
    }

    public QueryParams getQuery() {
@@ -144,7 +163,11 @@ public final class RWIProcess extends Thread {
        // so following sortings together with the global results will be fast
        try {
            final long timer = System.currentTimeMillis();
            final TermSearch<WordReference> search =
                this.query
                    .getSegment()
                    .termIndex()
                    .query(
                        this.query.queryHashes,
                        this.query.excludeHashes,
                        null,
@@ -152,7 +175,15 @@ public final class RWIProcess extends Thread {
                        this.query.maxDistance);
            this.localSearchInclusion = search.inclusion();
            final ReferenceContainer<WordReference> index = search.joined();
            EventTracker.update(
                EventTracker.EClass.SEARCH,
                new ProfilingGraph.EventSearch(
                    this.query.id(true),
                    SearchEvent.Type.JOIN,
                    this.query.queryString,
                    index.size(),
                    System.currentTimeMillis() - timer),
                false);
            if ( !index.isEmpty() ) {
                add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, true);
            }
@@ -176,7 +207,9 @@ public final class RWIProcess extends Thread {
        this.addRunning = true;

        assert (index != null);
        if ( index.isEmpty() ) {
            return;
        }

        if ( !local ) {
            assert fullResource >= 0 : "fullResource = " + fullResource;
@@ -188,27 +221,42 @@ public final class RWIProcess extends Thread {
        // normalize entries
        final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
        EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
            this.query.id(true),
            SearchEvent.Type.NORMALIZING,
            resourceName,
            index.size(),
            System.currentTimeMillis() - timer), false);
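        // count the references that actually arrived; takeRWI() shortens its per-result wait once
        // enough of the expected remote references have been received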
        this.receivedRemoteReferences.addAndGet(index.size());

        // iterate over normalized entries and select some that are better than currently stored
        timer = System.currentTimeMillis();
        final boolean nav_hosts =
            this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0;

        // apply all constraints
        try {
            WordReferenceVars iEntry;
            final String pattern = this.query.urlMask.pattern();
            final boolean httpPattern = pattern.equals("http://.*");
            final boolean noHttpButProtocolPattern =
                pattern.equals("https://.*")
                    || pattern.equals("ftp://.*")
                    || pattern.equals("smb://.*")
                    || pattern.equals("file://.*");
            pollloop: while ( true ) {
                iEntry = decodedEntries.poll(1, TimeUnit.SECONDS);
                if ( iEntry == null || iEntry == WordReferenceVars.poison ) {
                    break pollloop;
                }
                assert (iEntry.urlhash().length == index.row().primaryKeyLength);
                //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;

                // increase flag counts
                for ( int j = 0; j < 32; j++ ) {
                    if ( iEntry.flags().get(j) ) {
                        this.flagcount[j]++;
                    }
                }

                // check constraints
@@ -218,10 +266,22 @@ public final class RWIProcess extends Thread {
                // check document domain
                if ( this.query.contentdom != ContentDomain.TEXT ) {
                    if ( (this.query.contentdom == ContentDomain.AUDIO)
                        && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio))) ) {
                        continue pollloop;
                    }
                    if ( (this.query.contentdom == ContentDomain.VIDEO)
                        && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo))) ) {
                        continue pollloop;
                    }
                    if ( (this.query.contentdom == ContentDomain.IMAGE)
                        && (!(iEntry.flags().get(Condenser.flag_cat_hasimage))) ) {
                        continue pollloop;
                    }
                    if ( (this.query.contentdom == ContentDomain.APP)
                        && (!(iEntry.flags().get(Condenser.flag_cat_hasapp))) ) {
                        continue pollloop;
                    }
                }

                // check tld domain
@@ -254,8 +314,12 @@ public final class RWIProcess extends Thread {
                // check protocol
                if ( !this.query.urlMask_isCatchall ) {
                    final boolean httpFlagSet = DigestURI.flag4HTTPset(iEntry.urlHash);
                    if ( httpPattern && !httpFlagSet ) {
                        continue pollloop;
                    }
                    if ( noHttpButProtocolPattern && httpFlagSet ) {
                        continue pollloop;
                    }
                }

                // finally make a double-check and insert result to stack
@@ -264,7 +328,8 @@ public final class RWIProcess extends Thread {
                this.urlhashes.putUnique(iEntry.urlhash());
                rankingtryloop: while ( true ) {
                    try {
                        this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order
                            .cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
                        break rankingtryloop;
                    } catch ( final ArithmeticException e ) {
                        // this may happen if the concurrent normalizer changes values during cardinal computation
@@ -272,16 +337,29 @@ public final class RWIProcess extends Thread {
                    }
                }

                // increase counter for statistics
                if ( local ) {
                    this.local_indexCount++;
                } else {
                    this.remote_indexCount++;
                }
                //}
            }
        } catch ( final InterruptedException e ) {
        } catch ( final RowSpaceExceededException e ) {
        } finally {
            if ( finalizeAddAtEnd ) {
                this.addRunning = false;
            }
        }

        //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
        EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(
            this.query.id(true),
            SearchEvent.Type.PRESORT,
            resourceName,
            index.size(),
            System.currentTimeMillis() - timer), false);
    }
    /**
@@ -301,18 +379,24 @@ public final class RWIProcess extends Thread {
    }

    private boolean testFlags(final WordReference ientry) {
        if ( this.query.constraint == null ) {
            return true;
        }

        // test if ientry matches with filter
        // if all = true: let only entries pass that has all matching bits
        // if all = false: let all entries pass that has at least one matching bit
        if ( this.query.allofconstraint ) {
            for ( int i = 0; i < 32; i++ ) {
                if ( (this.query.constraint.get(i)) && (!ientry.flags().get(i)) ) {
                    return false;
                }
            }
            return true;
        }

        for ( int i = 0; i < 32; i++ ) {
            if ( (this.query.constraint.get(i)) && (ientry.flags().get(i)) ) {
                return true;
            }
        }
        return false;
    }
@@ -323,7 +407,9 @@ public final class RWIProcess extends Thread {
        return this.localSearchInclusion;
    }

    private WeakPriorityBlockingQueue.Element<WordReferenceVars> takeRWI(
        final boolean skipDoubleDom,
        final long waitingtime) {

        // returns from the current RWI list the best entry and removes this entry from the list
        WeakPriorityBlockingQueue<WordReferenceVars> m;
@@ -334,16 +420,36 @@ public final class RWIProcess extends Thread {
            //System.out.println("stack.poll: feeders = " + this.feeders + ", stack.sizeQueue = " + stack.sizeQueue());
            int loops = 0; // a loop counter to terminate the reading if all the results are from the same domain
            final long timeout = System.currentTimeMillis() + waitingtime;
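
            // adaptive wait: sleep the full maxWaitPerResult while fewer remote references than
            // expected have arrived, and a proportionally shorter time once more than the expected
            // number of references has been received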
            // wait some time if we did not get so much remote results so far to get a better ranking over remote results
            // we wait at most 30 milliseconds to get a maximum total waiting time of 300 milliseconds for 10 results
            long wait =
                this.receivedRemoteReferences.get() == 0 ? maxWaitPerResult : Math.min(
                    maxWaitPerResult,
                    maxWaitPerResult
                        * this.initialExpectedRemoteReferences
                        / this.receivedRemoteReferences.get());
            if ( wait > 0 ) {
                Thread.sleep(wait);
            }

            // loop as long as we can expect that we should get more results
            while ( ((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0)
                && (this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage) ) {
                if ( waitingtime <= 0 ) {
                    rwi = this.stack.poll();
                } else {
                    timeoutloop: while ( System.currentTimeMillis() < timeout ) {
                        if ( feedingIsFinished() && this.stack.sizeQueue() == 0 ) {
                            break timeoutloop;
                        }
                        rwi = this.stack.poll(50);
                        if ( rwi != null ) {
                            break timeoutloop;
                        }
                    }
                }
                if ( rwi == null ) {
                    break;
                }
                if ( !skipDoubleDom ) {
                    //System.out.println("!skipDoubleDom");
                    return rwi;
@@ -355,7 +461,10 @@ public final class RWIProcess extends Thread {
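                    // skipDoubleDom is active here: the first result seen from a host is returned
                    // directly, every further result from the same host is parked in the per-host
                    // doubleDomCache and only served once the main stack has run dry (see below)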
                    m = this.doubleDomCache.get(hosthash);
                    if ( m == null ) {
                        // first appearance of dom. we create an entry to signal that one of that domain was already returned
                        m =
                            new WeakPriorityBlockingQueue<WordReferenceVars>((this.query.specialRights)
                                ? maxDoubleDomSpecial
                                : maxDoubleDomAll);
                        this.doubleDomCache.put(hosthash, m);
                        return rwi;
                    }
@@ -363,15 +472,19 @@ public final class RWIProcess extends Thread {
                    m.put(rwi);
                }
            }
        } catch ( final InterruptedException e1 ) {
        }

        if ( this.doubleDomCache.isEmpty() ) {
            return null;
        }
        // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
        // find best entry from all caches
        WeakPriorityBlockingQueue.Element<WordReferenceVars> bestEntry = null;
        WeakPriorityBlockingQueue.Element<WordReferenceVars> o;
        synchronized ( this.doubleDomCache ) {
            final Iterator<WeakPriorityBlockingQueue<WordReferenceVars>> i =
                this.doubleDomCache.values().iterator();
            while ( i.hasNext() ) {
                try {
                    m = i.next();
@@ -379,19 +492,27 @@ public final class RWIProcess extends Thread {
                    Log.logException(e);
                    continue; // not the best solution...
                }
                if ( m == null ) {
                    continue;
                }
                if ( m.isEmpty() ) {
                    continue;
                }
                if ( bestEntry == null ) {
                    bestEntry = m.peek();
                    continue;
                }
                o = m.peek();
                if ( o == null ) {
                    continue;
                }
                if ( o.getWeight() < bestEntry.getWeight() ) {
                    bestEntry = o;
                }
            }
            if ( bestEntry == null ) {
                return null;
            }

            // finally remove the best entry from the doubledom cache
            m = this.doubleDomCache.get(bestEntry.getElement().hosthash());
@@ -401,10 +522,11 @@ public final class RWIProcess extends Thread {
    }
    /**
     * get one metadata entry from the ranked results. This will be the 'best' entry so far according to the
     * applied ranking. If there are no more entries left or the timeout limit is reached then null is
     * returned. The caller may distinguish the timeout case from the case where there will be no more entries
     * in the future by calling this.feedingIsFinished()
     *
     * @param skipDoubleDom should be true if it is wanted that double domain entries are skipped
     * @param waitingtime the time this method may take for a result computation
     * @return a metadata entry for a url
@@ -416,8 +538,11 @@ public final class RWIProcess extends Thread {
        long timeleft;
        while ( (timeleft = timeout - System.currentTimeMillis()) > 0 ) {
            //System.out.println("timeleft = " + timeleft);
            final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi =
                takeRWI(skipDoubleDom, timeleft);
            if ( obrwi == null ) {
                return null; // all time was already wasted in takeRWI to get another element
            }
            final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi);
            if ( page == null ) {
                try {
@@ -462,17 +587,17 @@ public final class RWIProcess extends Thread {
            final String pagetitle = metadata.dc_title().toLowerCase();

            // check exclusion
            if ( (QueryParams.anymatch(pagetitle, this.query.excludeHashes))
                || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.excludeHashes))
                || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.excludeHashes)) ) {
                this.sortout++;
                continue;
            }
            // check index-of constraint
            if ( (this.query.constraint != null)
                && (this.query.constraint.get(Condenser.flag_cat_indexof))
                && (!(pagetitle.startsWith("index of"))) ) {
                final Iterator<byte[]> wi = this.query.queryHashes.iterator();
                while ( wi.hasNext() ) {
                    this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash());
@@ -482,18 +607,18 @@ public final class RWIProcess extends Thread {
            }
            // check location constraint
            if ( (this.query.constraint != null)
                && (this.query.constraint.get(Condenser.flag_cat_haslocation))
                && (metadata.lat() == 0.0f || metadata.lon() == 0.0f) ) {
                this.sortout++;
                continue;
            }

            // check content domain
            if ( (this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0)
                || (this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0)
                || (this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0)
                || (this.query.contentdom == ContentDomain.APP && page.lapp() == 0) ) {
                this.sortout++;
                continue;
            }
@@ -534,7 +659,9 @@ public final class RWIProcess extends Thread {
            // file type navigation
            final String fileext = metadata.url().getFileExtension();
            if ( fileext.length() > 0 ) {
                this.filetypeNavigator.inc(fileext);
            }

            // check Scanner
            if ( !Scanner.acceptURL(metadata.url()) ) {
@@ -565,9 +692,13 @@ public final class RWIProcess extends Thread {
    }

    public boolean isEmpty() {
        if ( !this.stack.isEmpty() ) {
            return false;
        }
        for ( final WeakPriorityBlockingQueue<WordReferenceVars> s : this.doubleDomCache.values() ) {
            if ( !s.isEmpty() ) {
                return false;
            }
        }
        return true;
    }
@@ -616,22 +747,31 @@ public final class RWIProcess extends Thread {
    }

    public ScoreMap<String> getNamespaceNavigator() {
        if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace", 0) < 0 ) {
            return new ClusteredScoreMap<String>();
        }
        if ( this.namespaceNavigator.sizeSmaller(2) ) {
            this.namespaceNavigator.clear(); // navigators with one entry are not useful
        }
        return this.namespaceNavigator;
    }
    public ScoreMap<String> getHostNavigator() {
        final ScoreMap<String> result = new ConcurrentScoreMap<String>();
        if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts", 0) < 0 ) {
            return result;
        }

        final Iterator<String> domhashs = this.hostNavigator.keys(false);
        URIMetadataRow row;
        byte[] urlhash;
        String hosthash, hostname;
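        // resolve each counted host hash back to a readable host name: look up one representative
        // URL hash per host in hostResolver, load its metadata row and take that URL's host name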
        if ( this.hostResolver != null ) {
            while ( domhashs.hasNext() && result.sizeSmaller(30) ) {
                hosthash = domhashs.next();
                if ( hosthash == null ) {
                    continue;
                }
                urlhash = this.hostResolver.get(hosthash);
                row = urlhash == null ? null : this.query.getSegment().urlMetadata().load(urlhash);
                hostname = row == null ? null : row.metadata().url().getHost();
@@ -639,26 +779,43 @@ public final class RWIProcess extends Thread {
                    result.set(hostname, this.hostNavigator.get(hosthash));
                }
            }
        }
        if ( result.sizeSmaller(2) ) {
            result.clear(); // navigators with one entry are not useful
        }
        return result;
    }
    public ScoreMap<String> getProtocolNavigator() {
        if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("protocol", 0) < 0 ) {
            return new ClusteredScoreMap<String>();
        }
        if ( this.protocolNavigator.sizeSmaller(2) ) {
            this.protocolNavigator.clear(); // navigators with one entry are not useful
        }
        return this.protocolNavigator;
    }

    public ScoreMap<String> getFiletypeNavigator() {
        if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("filetype", 0) < 0 ) {
            return new ClusteredScoreMap<String>();
        }
        if ( this.filetypeNavigator.sizeSmaller(2) ) {
            this.filetypeNavigator.clear(); // navigators with one entry are not useful
        }
        return this.filetypeNavigator;
    }
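
    // orders Map.Entry<String, Integer> pairs by descending value, i.e. largest counts first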
    public static final Comparator<Map.Entry<String, Integer>> mecomp =
        new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(final Map.Entry<String, Integer> o1, final Map.Entry<String, Integer> o2) {
                if ( o1.getValue().intValue() < o2.getValue().intValue() ) {
                    return 1;
                }
                if ( o2.getValue().intValue() < o1.getValue().intValue() ) {
                    return -1;
                }
                return 0;
            }
        };
@@ -667,8 +824,12 @@ public final class RWIProcess extends Thread {
        // create a list of words that had been computed by statistics over all
        // words that appeared in the url or the description of all urls
        final ScoreMap<String> result = new ConcurrentScoreMap<String>();
        if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("topics", 0) < 0 ) {
            return result;
        }
        if ( this.ref.sizeSmaller(2) ) {
            this.ref.clear(); // navigators with one entry are not useful
        }
        final Map<String, Float> counts = new HashMap<String, Float>();
        final Iterator<String> i = this.ref.keys(false);
        String word;
@@ -678,7 +839,9 @@ public final class RWIProcess extends Thread {
        int ic = count;
        while ( ic-- > 0 && i.hasNext() ) {
            word = i.next();
            if ( word == null ) {
                continue;
            }
            termHash = Word.word2hash(word);
            c = this.query.getSegment().termIndex().count(termHash);
            if ( c > 0 ) {
@@ -688,9 +851,11 @@ public final class RWIProcess extends Thread {
                counts.put(word, q);
            }
        }
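        // scale the accumulated weights linearly into the range 0..count before they are written
        // into the result score map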
        if ( max > min ) {
            for ( final Map.Entry<String, Float> ce : counts.entrySet() ) {
                result.set(ce.getKey(), (int) (((double) count) * (ce.getValue() - min) / (max - min)));
            }
        }
        return this.ref;
    }
@@ -700,12 +865,13 @@ public final class RWIProcess extends Thread {
        String word;
        for ( final String w : words ) {
            word = w.toLowerCase();
            if ( word.length() > 2
                && "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off"
                    .indexOf(word) < 0
                && !this.query.queryHashes.has(Word.word2hash(word))
                && lettermatch.matcher(word).matches()
                && !Switchboard.badwords.contains(word)
                && !Switchboard.stopwords.contains(word) ) {
                this.ref.inc(word);
            }
        }
@@ -713,7 +879,9 @@ public final class RWIProcess extends Thread {
    public void addTopics(final ResultEntry resultEntry) {
        // take out relevant information for reference computation
        if ( (resultEntry.url() == null) || (resultEntry.title() == null) ) {
            return;
        }
        //final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
        final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description
@@ -725,8 +893,12 @@ public final class RWIProcess extends Thread {
    public ScoreMap<String> getAuthorNavigator() {
        // create a list of words that had been computed by statistics over all
        // words that appeared in the url or the description of all urls
        if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("authors", 0) < 0 ) {
            return new ConcurrentScoreMap<String>();
        }
        if ( this.authorNavigator.sizeSmaller(2) ) {
            this.authorNavigator.clear(); // navigators with one entry are not useful
        }
        return this.authorNavigator;
    }