// IndexControlRWIs_p.java
// -----------------------
// (C) 2004-2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2004 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2007-11-14 01:15:28 +0000 (Mi, 14 Nov 2007) $
// $LastChangedRevision: 4216 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File ;
import java.io.FileWriter ;
import java.io.IOException ;
import java.io.PrintWriter ;
import java.util.HashMap ;
import java.util.HashSet ;
import java.util.Iterator ;
import java.util.Set ;
import de.anomic.data.AbstractBlacklist ;
import de.anomic.data.listManager ;
import de.anomic.http.httpRequestHeader ;
import de.anomic.kelondro.order.Bitfield ;
import de.anomic.kelondro.text.Reference ;
import de.anomic.kelondro.text.ReferenceContainer ;
import de.anomic.kelondro.text.ReferenceContainerCache ;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow ;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow ;
import de.anomic.plasma.plasmaSearchAPI ;
import de.anomic.plasma.plasmaSearchEvent ;
import de.anomic.plasma.plasmaSearchRankingProcess ;
import de.anomic.plasma.plasmaSwitchboard ;
import de.anomic.plasma.plasmaWordIndex ;
import de.anomic.plasma.parser.Word ;
import de.anomic.server.serverObjects ;
import de.anomic.server.serverSwitch ;
import de.anomic.yacy.yacyClient ;
import de.anomic.yacy.yacySeed ;
import de.anomic.yacy.yacyURL ;
public class IndexControlRWIs_p {
public static serverObjects respond ( final httpRequestHeader header , final serverObjects post , final serverSwitch < ? > env ) {
// return variable that accumulates replacements
final plasmaSwitchboard sb = ( plasmaSwitchboard ) env ;
final serverObjects prop = new serverObjects ( ) ;
prop . putHTML ( "keystring" , "" ) ;
prop . put ( "keyhash" , "" ) ;
prop . put ( "result" , "" ) ;
// switch off all optional forms/lists
prop . put ( "searchresult" , 0 ) ;
prop . put ( "keyhashsimilar" , 0 ) ;
prop . put ( "genUrlList" , 0 ) ;
// clean up all search events
plasmaSearchEvent . cleanupEvents ( true ) ;
if ( post ! = null ) {
// default values
final String keystring = post . get ( "keystring" , "" ) . trim ( ) ;
String keyhash = post . get ( "keyhash" , "" ) . trim ( ) ;
prop . putHTML ( "keystring" , keystring ) ;
prop . putHTML ( "keyhash" , keyhash ) ;
// read values from checkboxes
String [ ] urlx = post . getAll ( "urlhx.*" ) ;
final boolean delurl = post . containsKey ( "delurl" ) ;
final boolean delurlref = post . containsKey ( "delurlref" ) ;
if ( post . containsKey ( "keystringsearch" ) ) {
keyhash = Word . word2hash ( keystring ) ;
prop . put ( "keyhash" , keyhash ) ;
final plasmaSearchRankingProcess ranking = plasmaSearchAPI . genSearchresult ( prop , sb , keyhash , null ) ;
if ( ranking . filteredCount ( ) = = 0 ) {
prop . put ( "searchresult" , 1 ) ;
prop . putHTML ( "searchresult_word" , keystring ) ;
}
}
if ( post . containsKey ( "keyhashsearch" ) ) {
if ( keystring . length ( ) = = 0 | | ! Word . word2hash ( keystring ) . equals ( keyhash ) ) {
prop . put ( "keystring" , "<not possible to compute word from hash>" ) ;
}
final plasmaSearchRankingProcess ranking = plasmaSearchAPI . genSearchresult ( prop , sb , keyhash , null ) ;
if ( ranking . filteredCount ( ) = = 0 ) {
prop . put ( "searchresult" , 2 ) ;
prop . putHTML ( "searchresult_wordhash" , keyhash ) ;
}
}
// delete everything
if ( post . containsKey ( "deletecomplete" ) & & post . containsKey ( "confirmDelete" ) ) {
sb . webIndex . clear ( ) ;
sb . crawlQueues . clear ( ) ;
sb . crawlStacker . clear ( ) ;
try {
sb . robots . clear ( ) ;
} catch ( final IOException e ) {
e . printStackTrace ( ) ;
}
post . remove ( "deletecomplete" ) ;
}
// delete word
if ( post . containsKey ( "keyhashdeleteall" ) ) try {
if ( delurl | | delurlref ) {
// generate an urlx array
ReferenceContainer index = null ;
index = sb . webIndex . index ( ) . get ( keyhash , null ) ;
final Iterator < WordReferenceRow > en = index . entries ( ) ;
int i = 0 ;
urlx = new String [ index . size ( ) ] ;
while ( en . hasNext ( ) ) {
urlx [ i + + ] = en . next ( ) . metadataHash ( ) ;
}
index = null ;
}
if ( delurlref ) {
for ( int i = 0 ; i < urlx . length ; i + + ) sb . removeAllUrlReferences ( urlx [ i ] , true ) ;
}
if ( delurl | | delurlref ) {
for ( int i = 0 ; i < urlx . length ; i + + ) {
sb . urlRemove ( urlx [ i ] ) ;
}
}
sb . webIndex . index ( ) . delete ( keyhash ) ;
post . remove ( "keyhashdeleteall" ) ;
post . put ( "urllist" , "generated" ) ;
} catch ( IOException e ) {
e . printStackTrace ( ) ;
}
// delete selected URLs
if ( post . containsKey ( "keyhashdelete" ) ) try {
if ( delurlref ) {
for ( int i = 0 ; i < urlx . length ; i + + ) sb . removeAllUrlReferences ( urlx [ i ] , true ) ;
}
if ( delurl | | delurlref ) {
for ( int i = 0 ; i < urlx . length ; i + + ) {
sb . urlRemove ( urlx [ i ] ) ;
}
}
final Set < String > urlHashes = new HashSet < String > ( ) ;
for ( int i = 0 ; i < urlx . length ; i + + ) urlHashes . add ( urlx [ i ] ) ;
sb . webIndex . index ( ) . remove ( keyhash , urlHashes ) ;
// this shall lead to a presentation of the list; so handle that the remaining program
// thinks that it was called for a list presentation
post . remove ( "keyhashdelete" ) ;
post . put ( "urllist" , "generated" ) ;
} catch ( IOException e ) {
e . printStackTrace ( ) ;
}
if ( post . containsKey ( "urllist" ) ) {
if ( keystring . length ( ) = = 0 | | ! Word . word2hash ( keystring ) . equals ( keyhash ) ) {
prop . put ( "keystring" , "<not possible to compute word from hash>" ) ;
}
final Bitfield flags = plasmaSearchAPI . compileFlags ( post ) ;
final int count = ( post . get ( "lines" , "all" ) . equals ( "all" ) ) ? - 1 : post . getInt ( "lines" , - 1 ) ;
final plasmaSearchRankingProcess ranking = plasmaSearchAPI . genSearchresult ( prop , sb , keyhash , flags ) ;
plasmaSearchAPI . genURLList ( prop , keyhash , keystring , ranking , flags , count ) ;
}
// transfer to other peer
if ( post . containsKey ( "keyhashtransfer" ) ) try {
if ( keystring . length ( ) = = 0 | | ! Word . word2hash ( keystring ) . equals ( keyhash ) ) {
prop . put ( "keystring" , "<not possible to compute word from hash>" ) ;
}
// find host & peer
String host = post . get ( "host" , "" ) ; // get host from input field
yacySeed seed = null ;
if ( host . length ( ) ! = 0 ) {
if ( host . length ( ) = = 12 ) {
// the host string is a peer hash
seed = sb . webIndex . peers ( ) . getConnected ( host ) ;
} else {
// the host string can be a host name
seed = sb . webIndex . peers ( ) . lookupByName ( host ) ;
}
} else {
host = post . get ( "hostHash" , "" ) ; // if input field is empty, get from select box
seed = sb . webIndex . peers ( ) . getConnected ( host ) ;
}
// prepare index
ReferenceContainer index ;
final long starttime = System . currentTimeMillis ( ) ;
index = sb . webIndex . index ( ) . get ( keyhash , null ) ;
// built urlCache
final Iterator < WordReferenceRow > urlIter = index . entries ( ) ;
final HashMap < String , URLMetadataRow > knownURLs = new HashMap < String , URLMetadataRow > ( ) ;
final HashSet < String > unknownURLEntries = new HashSet < String > ( ) ;
Reference iEntry ;
URLMetadataRow lurl ;
while ( urlIter . hasNext ( ) ) {
iEntry = urlIter . next ( ) ;
lurl = sb . webIndex . metadata ( ) . load ( iEntry . metadataHash ( ) , null , 0 ) ;
if ( lurl = = null ) {
unknownURLEntries . add ( iEntry . metadataHash ( ) ) ;
urlIter . remove ( ) ;
} else {
knownURLs . put ( iEntry . metadataHash ( ) , lurl ) ;
}
}
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago
// make an indexContainerCache
ReferenceContainerCache icc = new ReferenceContainerCache ( index . rowdef , plasmaWordIndex . wordOrder ) ;
icc . add ( index ) ;
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago
// transport to other peer
final String gzipBody = sb . getConfig ( "indexControl.gzipBody" , "false" ) ;
final int timeout = ( int ) sb . getConfigLong ( "indexControl.timeout" , 60000 ) ;
final String error = yacyClient . transferIndex (
seed ,
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago
icc ,
knownURLs ,
"true" . equalsIgnoreCase ( gzipBody ) ,
timeout ) ;
prop . put ( "result" , ( error = = null ) ? ( "Successfully transferred " + knownURLs . size ( ) + " words in " + ( ( System . currentTimeMillis ( ) - starttime ) / 1000 ) + " seconds, " + unknownURLEntries + " URL not found" ) : "error: " + error ) ;
index = null ;
} catch ( IOException e ) {
e . printStackTrace ( ) ;
}
// generate list
if ( post . containsKey ( "keyhashsimilar" ) ) try {
final Iterator < ReferenceContainer > containerIt = sb . webIndex . index ( ) . references ( keyhash , true , 256 , false ) . iterator ( ) ;
ReferenceContainer container ;
int i = 0 ;
int rows = 0 , cols = 0 ;
prop . put ( "keyhashsimilar" , "1" ) ;
while ( containerIt . hasNext ( ) & & i < 256 ) {
container = containerIt . next ( ) ;
prop . put ( "keyhashsimilar_rows_" + rows + "_cols_" + cols + "_wordHash" , container . getTermHash ( ) ) ;
cols + + ;
if ( cols = = 8 ) {
prop . put ( "keyhashsimilar_rows_" + rows + "_cols" , cols ) ;
cols = 0 ;
rows + + ;
}
i + + ;
}
prop . put ( "keyhashsimilar_rows_" + rows + "_cols" , cols ) ;
prop . put ( "keyhashsimilar_rows" , rows + 1 ) ;
prop . put ( "result" , "" ) ;
} catch ( IOException e ) {
e . printStackTrace ( ) ;
}
if ( post . containsKey ( "blacklist" ) ) {
final String blacklist = post . get ( "blacklist" , "" ) ;
final Set < String > urlHashes = new HashSet < String > ( ) ;
if ( post . containsKey ( "blacklisturls" ) ) {
PrintWriter pw ;
try {
final String [ ] supportedBlacklistTypes = env . getConfig ( "BlackLists.types" , "" ) . split ( "," ) ;
pw = new PrintWriter ( new FileWriter ( new File ( listManager . listsPath , blacklist ) , true ) ) ;
yacyURL url ;
for ( int i = 0 ; i < urlx . length ; i + + ) {
urlHashes . add ( urlx [ i ] ) ;
final URLMetadataRow e = sb . webIndex . metadata ( ) . load ( urlx [ i ] , null , 0 ) ;
sb . webIndex . metadata ( ) . remove ( urlx [ i ] ) ;
if ( e ! = null ) {
url = e . metadata ( ) . url ( ) ;
pw . println ( url . getHost ( ) + "/" + url . getFile ( ) ) ;
for ( int blTypes = 0 ; blTypes < supportedBlacklistTypes . length ; blTypes + + ) {
if ( listManager . listSetContains ( supportedBlacklistTypes [ blTypes ] + ".BlackLists" , blacklist ) ) {
plasmaSwitchboard . urlBlacklist . add (
supportedBlacklistTypes [ blTypes ] ,
url . getHost ( ) ,
url . getFile ( ) ) ;
}
}
}
}
pw . close ( ) ;
} catch ( final IOException e ) {
}
}
if ( post . containsKey ( "blacklistdomains" ) ) {
PrintWriter pw ;
try {
final String [ ] supportedBlacklistTypes = AbstractBlacklist . BLACKLIST_TYPES_STRING . split ( "," ) ;
pw = new PrintWriter ( new FileWriter ( new File ( listManager . listsPath , blacklist ) , true ) ) ;
yacyURL url ;
for ( int i = 0 ; i < urlx . length ; i + + ) {
urlHashes . add ( urlx [ i ] ) ;
final URLMetadataRow e = sb . webIndex . metadata ( ) . load ( urlx [ i ] , null , 0 ) ;
sb . webIndex . metadata ( ) . remove ( urlx [ i ] ) ;
if ( e ! = null ) {
url = e . metadata ( ) . url ( ) ;
pw . println ( url . getHost ( ) + "/.*" ) ;
for ( int blTypes = 0 ; blTypes < supportedBlacklistTypes . length ; blTypes + + ) {
if ( listManager . listSetContains ( supportedBlacklistTypes [ blTypes ] + ".BlackLists" , blacklist ) ) {
plasmaSwitchboard . urlBlacklist . add (
supportedBlacklistTypes [ blTypes ] ,
url . getHost ( ) , ".*" ) ;
}
}
}
}
pw . close ( ) ;
} catch ( final IOException e ) {
}
}
try {
sb . webIndex . index ( ) . remove ( keyhash , urlHashes ) ;
} catch ( IOException e ) {
e . printStackTrace ( ) ;
}
}
if ( prop . getInt ( "searchresult" , 0 ) = = 3 ) plasmaSearchAPI . listHosts ( prop , keyhash , sb ) ;
}
// insert constants
prop . putNum ( "wcount" , sb . webIndex . index ( ) . size ( ) ) ;
// return rewrite properties
return prop ;
}
}