@ -51,6 +51,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL ;
import net.yacy.cora.document.id.MultiProtocolURL ;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector ;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector ;
import net.yacy.cora.federate.solr.connector.SolrConnector ;
import net.yacy.cora.federate.solr.connector.SolrConnector ;
import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL ;
import net.yacy.cora.federate.yacy.CacheStrategy ;
import net.yacy.cora.federate.yacy.CacheStrategy ;
import net.yacy.cora.order.Base64Order ;
import net.yacy.cora.order.Base64Order ;
import net.yacy.cora.order.ByteOrder ;
import net.yacy.cora.order.ByteOrder ;
@ -110,8 +111,9 @@ public class Segment {
public static final long targetFileSize = 64 * 1024 * 1024 ; // 64 MB
public static final long targetFileSize = 64 * 1024 * 1024 ; // 64 MB
public static final int writeBufferSize = 4 * 1024 * 1024 ;
public static final int writeBufferSize = 4 * 1024 * 1024 ;
public static final String termIndexName = "text.index" ;
public static final String termIndexName = "text.index" ;
public static final String citationIndexName = "citation.index" ;
public static final String citationIndexName = "citation.index" ;
public static final String firstseenIndexName = "firstseen.index" ;
public static final String firstseenIndexName = "firstseen.index" ;
public static final String loadtimeIndexName = "loadtime.index" ;
// the reference factory
// the reference factory
public static final ReferenceFactory < WordReference > wordReferenceFactory = new WordReferenceFactory ( ) ;
public static final ReferenceFactory < WordReference > wordReferenceFactory = new WordReferenceFactory ( ) ;
@ -122,9 +124,10 @@ public class Segment {
private final File segmentPath ;
private final File segmentPath ;
protected final Fulltext fulltext ;
protected final Fulltext fulltext ;
protected IndexCell < WordReference > termIndex ;
protected IndexCell < WordReference > termIndex ;
protected IndexCell < CitationReference > urlCitationIndex ;
private IndexCell < CitationReference > urlCitationIndex ;
protected IndexTable firstSeenIndex ;
private IndexTable firstSeenIndex ;
protected IODispatcher merger = null ; // shared iodispatcher for kelondro indexes
private IndexTable loadTimeIndex ;
private IODispatcher merger = null ; // shared iodispatcher for kelondro indexes
/ * *
/ * *
* create a new Segment
* create a new Segment
@ -143,6 +146,7 @@ public class Segment {
this . termIndex = null ;
this . termIndex = null ;
this . urlCitationIndex = null ;
this . urlCitationIndex = null ;
this . firstSeenIndex = new IndexTable ( new File ( segmentPath , firstseenIndexName ) , 12 , 8 , false , false ) ;
this . firstSeenIndex = new IndexTable ( new File ( segmentPath , firstseenIndexName ) , 12 , 8 , false , false ) ;
this . loadTimeIndex = new IndexTable ( new File ( segmentPath , loadtimeIndexName ) , 12 , 8 , false , false ) ;
}
}
public boolean connectedRWI ( ) {
public boolean connectedRWI ( ) {
@ -166,7 +170,7 @@ public class Segment {
targetFileSize ,
targetFileSize ,
maxFileSize ,
maxFileSize ,
writeBufferSize ,
writeBufferSize ,
merger) ;
this . merger) ;
}
}
public void disconnectRWI ( ) {
public void disconnectRWI ( ) {
@ -196,7 +200,7 @@ public class Segment {
targetFileSize ,
targetFileSize ,
maxFileSize ,
maxFileSize ,
writeBufferSize ,
writeBufferSize ,
merger) ;
this . merger) ;
}
}
public void disconnectCitation ( ) {
public void disconnectCitation ( ) {
@ -225,10 +229,14 @@ public class Segment {
return this . urlCitationIndex ;
return this . urlCitationIndex ;
}
}
public IndexTable firstSeen ( ) {
public IndexTable firstSeen Index ( ) {
return this . firstSeenIndex ;
return this . firstSeenIndex ;
}
}
/**
 * Gives access to the index table that holds the load times.
 *
 * @return the load-time index table held by this segment
 */
public IndexTable loadTimeIndex() {
    final IndexTable table = this.loadTimeIndex;
    return table;
}
public ReferenceReportCache getReferenceReportCache ( ) {
public ReferenceReportCache getReferenceReportCache ( ) {
return new ReferenceReportCache ( ) ;
return new ReferenceReportCache ( ) ;
}
}
@ -239,12 +247,12 @@ public class Segment {
this . cache = new ConcurrentHashMap < String , ReferenceReport > ( ) ;
this . cache = new ConcurrentHashMap < String , ReferenceReport > ( ) ;
}
}
public ReferenceReport getReferenceReport ( final String id , final boolean acceptSelfReference ) throws IOException {
public ReferenceReport getReferenceReport ( final String id , final boolean acceptSelfReference ) throws IOException {
ReferenceReport rr = cache. get ( id ) ;
ReferenceReport rr = this . cache. get ( id ) ;
if ( MemoryControl . shortStatus ( ) ) cache. clear ( ) ;
if ( MemoryControl . shortStatus ( ) ) this . cache. clear ( ) ;
if ( rr ! = null ) return rr ;
if ( rr ! = null ) return rr ;
try {
try {
rr = new ReferenceReport ( ASCII . getBytes ( id ) , acceptSelfReference ) ;
rr = new ReferenceReport ( ASCII . getBytes ( id ) , acceptSelfReference ) ;
cache. put ( id , rr ) ;
this . cache. put ( id , rr ) ;
return rr ;
return rr ;
} catch ( final SpaceExceededException e ) {
} catch ( final SpaceExceededException e ) {
ConcurrentLog . logException ( e ) ;
ConcurrentLog . logException ( e ) ;
@ -278,19 +286,19 @@ public class Segment {
CitationReference ref = ri . next ( ) ;
CitationReference ref = ri . next ( ) ;
byte [ ] hh = ref . hosthash ( ) ; // host hash
byte [ ] hh = ref . hosthash ( ) ; // host hash
if ( ByteBuffer . equals ( hh , 0 , id , 6 , 6 ) ) {
if ( ByteBuffer . equals ( hh , 0 , id , 6 , 6 ) ) {
internalIDs. put ( ref . urlhash ( ) ) ;
this . internalIDs. put ( ref . urlhash ( ) ) ;
internal+ + ;
this . internal+ + ;
} else {
} else {
externalHosts. put ( hh ) ;
this . externalHosts. put ( hh ) ;
externalIDs. put ( ref . urlhash ( ) ) ;
this . externalIDs. put ( ref . urlhash ( ) ) ;
external+ + ;
this . external+ + ;
}
}
}
}
} catch ( SpaceExceededException e ) {
} catch ( SpaceExceededException e ) {
// the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now
// the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now
if ( Segment . this . fulltext . useWebgraph ( ) ) internalIDs. clear ( ) ;
if ( Segment . this . fulltext . useWebgraph ( ) ) this . internalIDs. clear ( ) ;
}
}
if ( ( internalIDs. size ( ) = = 0 | | ! connectedCitation ( ) ) & & Segment . this . fulltext . useWebgraph ( ) ) {
if ( ( this . internalIDs. size ( ) = = 0 | | ! connectedCitation ( ) ) & & Segment . this . fulltext . useWebgraph ( ) ) {
// read the references from the webgraph
// read the references from the webgraph
SolrConnector webgraph = Segment . this . fulltext . getWebgraphConnector ( ) ;
SolrConnector webgraph = Segment . this . fulltext . getWebgraphConnector ( ) ;
BlockingQueue < SolrDocument > docs = webgraph . concurrentDocumentsByQuery ( "{!cache=false raw f=" + WebgraphSchema . target_id_s . getSolrFieldName ( ) + "}" + ASCII . String ( id ) , WebgraphSchema . source_chars_i . getSolrFieldName ( ) + " asc" , 0 , 10000000 , Long . MAX_VALUE , 100 , 1 , false , WebgraphSchema . source_id_s . getSolrFieldName ( ) ) ;
BlockingQueue < SolrDocument > docs = webgraph . concurrentDocumentsByQuery ( "{!cache=false raw f=" + WebgraphSchema . target_id_s . getSolrFieldName ( ) + "}" + ASCII . String ( id ) , WebgraphSchema . source_chars_i . getSolrFieldName ( ) + " asc" , 0 , 10000000 , Long . MAX_VALUE , 100 , 1 , false , WebgraphSchema . source_id_s . getSolrFieldName ( ) ) ;
@ -305,13 +313,13 @@ public class Segment {
System . arraycopy ( refidh , 6 , hh , 0 , 6 ) ;
System . arraycopy ( refidh , 6 , hh , 0 , 6 ) ;
if ( ByteBuffer . equals ( hh , 0 , id , 6 , 6 ) ) {
if ( ByteBuffer . equals ( hh , 0 , id , 6 , 6 ) ) {
if ( acceptSelfReference | | ! Arrays . equals ( refidh , id ) ) {
if ( acceptSelfReference | | ! Arrays . equals ( refidh , id ) ) {
internalIDs. put ( refidh ) ;
this . internalIDs. put ( refidh ) ;
internal+ + ;
this . internal+ + ;
}
}
} else {
} else {
externalHosts. put ( hh ) ;
this . externalHosts. put ( hh ) ;
externalIDs. put ( refidh ) ;
this . externalIDs. put ( refidh ) ;
external+ + ;
this . external+ + ;
}
}
}
}
} catch ( final InterruptedException e ) {
} catch ( final InterruptedException e ) {
@ -398,6 +406,35 @@ public class Segment {
}
}
}
}
public void setLoadTime ( final byte [ ] urlhash , long time ) {
if ( urlhash = = null | | time < = 0 ) return ;
try {
this . loadTimeIndex . put ( urlhash , time ) ; // ALWAYS overwrite!
} catch ( IOException e ) {
ConcurrentLog . logException ( e ) ;
}
}
public long getLoadTime ( final byte [ ] urlhash ) {
if ( urlhash = = null ) return - 1 ;
try {
return this . loadTimeIndex . get ( urlhash ) ;
} catch ( IOException e ) {
ConcurrentLog . logException ( e ) ;
return - 1 ;
}
}
/**
 * Combines a url with the load time stored for its hash.
 *
 * @param url     the url to put into the result
 * @param urlhash the url hash used to look up the load time
 * @return a LoadTimeURL built from {@code url} and the looked-up time,
 *         or null when the lookup yields a negative value
 */
public LoadTimeURL getLoadTimeURL(String url, byte[] urlhash) {
    final long loadTime = getLoadTime(urlhash);
    return loadTime >= 0 ? new LoadTimeURL(url, loadTime) : null;
}
/**
 * Convenience variant of {@code getLoadTimeURL(String, byte[])} that takes
 * the url hash as a String.
 *
 * @param url the url to put into the result
 * @param id  the url hash as String; may be null
 * @return the result of the byte[] variant for the converted hash, or null
 *         when {@code id} is null
 */
public LoadTimeURL getLoadTimeURL(String url, String id) {
    if (id == null) {
        // no hash means no entry can be found; mirror the "not found" result
        // of the byte[] variant instead of failing with a NullPointerException
        return null;
    }
    // Convert with an explicit charset: the no-arg getBytes() depends on the
    // platform default encoding, which could yield different key bytes on
    // different hosts.
    // NOTE(review): assumes url hashes are plain ASCII, as the ASCII.getBytes(id)
    // conversions elsewhere in this file suggest - confirm.
    return getLoadTimeURL(url, id.getBytes(java.nio.charset.StandardCharsets.US_ASCII));
}
/ * *
/ * *
* check if a given document , identified by url hash as document id exists
* check if a given document , identified by url hash as document id exists
* @param id the url hash and document id
* @param id the url hash and document id
@ -483,6 +520,7 @@ public class Segment {
if ( this . fulltext ! = null ) this . fulltext . close ( ) ;
if ( this . fulltext ! = null ) this . fulltext . close ( ) ;
if ( this . urlCitationIndex ! = null ) this . urlCitationIndex . close ( ) ;
if ( this . urlCitationIndex ! = null ) this . urlCitationIndex . close ( ) ;
if ( this . firstSeenIndex ! = null ) this . firstSeenIndex . close ( ) ;
if ( this . firstSeenIndex ! = null ) this . firstSeenIndex . close ( ) ;
if ( this . loadTimeIndex ! = null ) this . loadTimeIndex . close ( ) ;
if ( this . merger ! = null ) {
if ( this . merger ! = null ) {
this . merger . terminate ( ) ;
this . merger . terminate ( ) ;
this . merger = null ;
this . merger = null ;
@ -661,7 +699,9 @@ public class Segment {
}
}
// REMEMBER FIRST SEEN
// REMEMBER FIRST SEEN
setFirstSeenTime ( url . hash ( ) , Math . min ( document . getLastModified ( ) . getTime ( ) , System . currentTimeMillis ( ) ) ) ; // should exist already in the index at this time, but just to make sure
long now = System . currentTimeMillis ( ) ;
setFirstSeenTime ( url . hash ( ) , Math . min ( document . getLastModified ( ) . getTime ( ) , now ) ) ; // should exist already in the index at this time, but just to make sure
setLoadTime ( url . hash ( ) , now ) ; // always overwrites index entry
// write the edges to the citation reference index
// write the edges to the citation reference index
if ( this . connectedCitation ( ) ) try {
if ( this . connectedCitation ( ) ) try {
@ -676,7 +716,7 @@ public class Segment {
String referrerhash = id ;
String referrerhash = id ;
String anchorhash = ASCII . String ( new DigestURL ( targetURL ) . hash ( ) ) ;
String anchorhash = ASCII . String ( new DigestURL ( targetURL ) . hash ( ) ) ;
if ( referrerhash ! = null & & anchorhash ! = null ) {
if ( referrerhash ! = null & & anchorhash ! = null ) {
urlCitationIndex. add ( ASCII . getBytes ( anchorhash ) , new CitationReference ( ASCII . getBytes ( referrerhash ) , loadDate . getTime ( ) ) ) ;
this . urlCitationIndex. add ( ASCII . getBytes ( anchorhash ) , new CitationReference ( ASCII . getBytes ( referrerhash ) , loadDate . getTime ( ) ) ) ;
}
}
} catch ( Throwable e ) {
} catch ( Throwable e ) {
ConcurrentLog . logException ( e ) ;
ConcurrentLog . logException ( e ) ;
@ -692,7 +732,7 @@ public class Segment {
String referrerhash = id ;
String referrerhash = id ;
String anchorhash = ASCII . String ( new DigestURL ( targetURL ) . hash ( ) ) ;
String anchorhash = ASCII . String ( new DigestURL ( targetURL ) . hash ( ) ) ;
if ( referrerhash ! = null & & anchorhash ! = null ) {
if ( referrerhash ! = null & & anchorhash ! = null ) {
urlCitationIndex. add ( ASCII . getBytes ( anchorhash ) , new CitationReference ( ASCII . getBytes ( referrerhash ) , loadDate . getTime ( ) ) ) ;
this . urlCitationIndex. add ( ASCII . getBytes ( anchorhash ) , new CitationReference ( ASCII . getBytes ( referrerhash ) , loadDate . getTime ( ) ) ) ;
}
}
} catch ( Throwable e ) {
} catch ( Throwable e ) {
ConcurrentLog . logException ( e ) ;
ConcurrentLog . logException ( e ) ;