@@ -1,4 +1,4 @@
-// getpageinfo_p
+// getpageinfo
 // (C) 2011 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 // first published 11.11.2011 on http://yacy.net
 //
@@ -24,229 +24,46 @@
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.util.ArrayList;
-import java.util.Collection;
 import java.util.List;
-import java.util.Set;
+import java.util.Map.Entry;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-import net.yacy.cora.document.id.AnchorURL;
-import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.federate.yacy.CacheStrategy;
-import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.robots.RobotsTxtEntry;
-import net.yacy.repository.Blacklist.BlacklistType;
-import net.yacy.search.Switchboard;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;
-import org.w3c.dom.Document;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
+/**
+ * @deprecated use now {@link getpageinfo_p}
+ */
+@Deprecated
 public class getpageinfo {
-    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
-        final Switchboard sb = (Switchboard) env;
+    @SuppressWarnings("unused")
+    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
         final serverObjects prop = new serverObjects();
-        // avoid UNRESOLVED PATTERN
-        prop.put("title", "");
-        prop.put("desc", "");
-        prop.put("lang", "");
-        prop.put("robots-allowed", "3"); //unknown
-        prop.put("robotsInfo", ""); //unknown
-        prop.put("icons", "0");
-        prop.put("sitelist", "");
-        prop.put("filter", ".*");
-        prop.put("oai", 0);
-        // default actions
-        String actions = "title,robots";
-        if (post != null && post.containsKey("url")) {
-            if (post.containsKey("actions"))
-                actions = post.get("actions");
-            String url = post.get("url");
-            String agentName = post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName);
-            ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
-            if (url.toLowerCase().startsWith("ftp://")) {
-                prop.put("robots-allowed", "1"); // ok to crawl
-                prop.put("robotsInfo", "ftp does not follow robots.txt");
-                prop.putXML("title", "FTP: " + url);
-                return prop;
-            } else if (!url.startsWith("http://") &&
-                    !url.startsWith("https://") &&
-                    !url.startsWith("ftp://") &&
-                    !url.startsWith("smb://") &&
-                    !url.startsWith("file://")) {
-                url = "http://" + url;
-            }
-            if (actions.indexOf("title", 0) >= 0) {
-                DigestURL u = null;
-                try {
-                    u = new DigestURL(url);
-                } catch (final MalformedURLException e) {
-                    ConcurrentLog.logException(e);
-                }
-                net.yacy.document.Document scraper = null;
-                if (u != null) try {
-                    scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
-                } catch (final IOException e) {
-                    ConcurrentLog.logException(e);
-                    // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
-                    // that should not affect the robots.txt validity
-                }
-                if (scraper != null) {
-                    // put the document title
-                    prop.putXML("title", removelinebreaks(scraper.dc_title()));
-                    Set<DigestURL> iconURLs = scraper.getIcons().keySet();
-                    int i = 0;
-                    for (DigestURL iconURL : iconURLs) {
-                        prop.putXML("icons_" + i + "_icon", iconURL.toNormalform(false));
-                        prop.put("icons_" + i + "_eol", 1);
-                        i++;
-                    }
-                    prop.put("icons_" + (i - 1) + "_eol", 0);
-                    prop.put("icons", iconURLs.size());
-                    // put keywords
-                    final Set<String> list = scraper.dc_subject();
-                    int count = 0;
-                    for (final String element : list) {
-                        if (!element.equals("")) {
-                            prop.putXML("tags_" + count + "_tag", element);
-                            count++;
-                        }
-                    }
-                    prop.put("tags", count);
-                    // put description
-                    prop.putXML("desc", removelinebreaks(scraper.dc_description().length > 0 ? scraper.dc_description()[0] : ""));
-                    // put language
-                    final Set<String> languages = scraper.getContentLanguages();
-                    prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
-                    // get links and put them into a semicolon-separated list
-                    final Collection<AnchorURL> uris = scraper.getAnchors();
-                    final StringBuilder links = new StringBuilder(uris.size() * 80);
-                    final StringBuilder filter = new StringBuilder(uris.size() * 40);
-                    count = 0;
-                    for (final DigestURL uri : uris) {
-                        if (uri == null) continue;
-                        links.append(';').append(uri.toNormalform(true));
-                        filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
-                        prop.putXML("links_" + count + "_link", uri.toNormalform(true));
-                        count++;
-                    }
-                    prop.put("links", count);
-                    prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
-                    prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
-                }
-            }
-            if (actions.indexOf("robots", 0) >= 0) {
-                try {
-                    final DigestURL theURL = new DigestURL(url);
-                    // determine if crawling of the current URL is allowed
-                    RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
-                    prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
-                    prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
-                    // get the sitemap URL of the domain
-                    final List<String> sitemaps = robotsEntry == null ? new ArrayList<String>(0) : robotsEntry.getSitemaps();
-                    for (int i = 0; i < sitemaps.size(); i++) {
-                        prop.putXML("sitemaps_" + i + "_sitemap", sitemaps.get(i));
-                    }
-                    prop.put("sitemaps", sitemaps.size());
-                } catch (final MalformedURLException e) {
-                    ConcurrentLog.logException(e);
-                }
-            }
-            if (actions.indexOf("oai", 0) >= 0) {
-                try {
-                    final DigestURL theURL = new DigestURL(url + "?verb=Identify");
-                    final String oairesult = checkOAI(theURL.toNormalform(false));
-                    prop.put("oai", oairesult == "" ? 0 : 1);
-                    if (oairesult != "") {
-                        prop.putXML("title", oairesult);
-                    }
-                } catch (final MalformedURLException e) {
-                }
-            }
+        /* Redirect to getpageinfo_p */
+        StringBuilder redirectedLocation;
+        if (header != null && header.getPathInfo() != null && header.getPathInfo().endsWith(".json")) {
+            redirectedLocation = new StringBuilder("getpageinfo_p.json");
+        } else {
+            redirectedLocation = new StringBuilder("getpageinfo_p.xml");
         }
-        // return rewrite properties
-        return prop;
-    }
-    private static String removelinebreaks(String dc_title) {
-        String newtitle = dc_title.replace("\r", "");
-        newtitle = newtitle.replace("\n", "");
-        newtitle = newtitle.replace("\r\n", "");
-        return newtitle;
-    }
-    private static String checkOAI(final String url) {
-        final DocumentBuilderFactory factory = DocumentBuilderFactory
-                .newInstance();
-        try {
-            final DocumentBuilder builder = factory.newDocumentBuilder();
-            return parseXML(builder.parse(url));
-        } catch (final ParserConfigurationException ex) {
-            ConcurrentLog.logException(ex);
-        } catch (final SAXException ex) {
-            ConcurrentLog.logException(ex);
-        } catch (final IOException ex) {
-            ConcurrentLog.logException(ex);
-        }
-        return "";
-    }
-    private static String parseXML(final Document doc) {
-        String repositoryName = null;
-        final NodeList items = doc.getDocumentElement().getElementsByTagName(
-                "Identify");
-        if (items.getLength() == 0) {
-            return "";
-        }
-        for (int i = 0, n = items.getLength(); i < n; ++i) {
-            if (!"Identify".equals(items.item(i).getNodeName()))
-                continue;
-            final NodeList currentNodeChildren = items.item(i).getChildNodes();
-            for (int j = 0, m = currentNodeChildren.getLength(); j < m; ++j) {
-                final Node currentNode = currentNodeChildren.item(j);
-                if ("repositoryName".equals(currentNode.getNodeName())) {
-                    repositoryName = currentNode.getFirstChild().getNodeValue();
+        /* Append eventual request parameters to the redirected location */
+        if (post != null) {
+            List<Entry<String, String>> parameters = post.entrySet();
+            if (parameters != null && !parameters.isEmpty()) {
+                redirectedLocation.append("?");
+                for (Entry<String, String> entry : parameters) {
+                    redirectedLocation.append(entry.getKey()).append("=").append(entry.getValue()).append("&");
                }
+                /* Remove trailing "&" */
+                redirectedLocation.setLength(redirectedLocation.length() - 1);
            }
-            if (repositoryName == null) {
-                return "";
-            }
         }
-        return repositoryName;
-    }
+        prop.put(serverObjects.ACTION_LOCATION, redirectedLocation.toString());
+        return prop;
+    }
 }