@ -1,21 +1,430 @@
package net.yacy.interaction ;
import java.io.BufferedReader ;
import java.io.ByteArrayInputStream ;
import java.io.ByteArrayOutputStream ;
import java.io.File ;
import java.io.FileReader ;
import java.io.IOException ;
import java.io.InputStream ;
import java.io.StringReader ;
import java.net.URLEncoder ;
import java.nio.charset.Charset ;
import net.yacy.yacy ;
import net.yacy.cora.protocol.RequestHeader ;
import net.yacy.cora.protocol.http.HTTPClient ;
import net.yacy.document.Document ;
import net.yacy.document.TextParser ;
import net.yacy.kelondro.data.meta.DigestURI ;
import net.yacy.kelondro.logging.Log ;
import net.yacy.search.Switchboard ;
import org.htmlparser.Tag ;
import org.htmlparser.Text ;
import org.htmlparser.tags.LinkTag ;
import org.htmlparser.util.NodeList ;
import org.htmlparser.visitors.NodeVisitor ;
import de.anomic.http.server.ServerSideIncludes ;
public class AugmentHtmlStream {
static RequestHeader globalrequestHeader ;
/ * *
* creates a NodeVisitor which assigns a unique ID to every node
*
* @return customized NodeVisitor
* /
private static class VisitorAddUniqueID extends NodeVisitor {
private int counter ;
public VisitorAddUniqueID ( ) {
this . setCounter ( 0 ) ;
}
@Override
public void visitTag ( Tag tag ) {
if ( tag . getAttribute ( "id" ) = = null ) {
this . setCounter ( this . getCounter ( ) + 1 ) ;
tag . setAttribute ( "id" , "\"sci" + this . getCounter ( ) + "\"" ) ;
}
if ( tag instanceof org . htmlparser . tags . LinkTag ) {
// Link
Log . logInfo ( "AUGMENTATION" , tag . getAttribute ( "href" ) ) ;
LinkTag lt = ( LinkTag ) tag ;
}
}
@Override
public void visitStringNode ( Text string ) {
}
public void setCounter ( int counter ) {
this . counter = counter ;
}
public int getCounter ( ) {
return this . counter ;
}
}
/ * *
* creates a NodeVisitor which inspects the element if it contains useful
* text
*
* @return customized NodeVisitor
* /
private static class VisitorText extends NodeVisitor {
private int counter ;
public VisitorText ( ) {
this . setCounter ( 0 ) ;
}
@Override
public void visitTag ( Tag tag ) {
// tag.setText(tag.getText()+" <span>augmented</span>");
// Node node = new org.htmlparser.nodes.TextNode(loadInternal("interactionparts/scibutton.html", globalrequestHeader));
// NodeList nl = tag.getChildren();
// nl.add (node);
// tag.setChildren(nl);
}
@Override
public void visitStringNode ( Text string ) {
// if (string.getParent() != null) {
//
// string.setText(string
// .getText()
// .replaceAll("und",
// "<a href=\"http://www.kit.edu/\" target=\"_blank\">KIT</a>"));
//
//
// }
}
public void setCounter ( int counter ) {
this . counter = counter ;
}
public int getCounter ( ) {
return this . counter ;
}
}
/ * *
* send web page to external REFLECT web service
*
* @return the web page with integrated REFLECT elements
* /
private static String processExternal ( String url , String fieldname ,
String data ) throws IOException {
final HTTPClient client = new HTTPClient ( ) ;
try {
StringBuilder postdata = new StringBuilder ( ) ;
postdata . append ( "document=" ) ;
postdata . append ( URLEncoder . encode ( data , "UTF-8" ) ) ;
InputStream in = new ByteArrayInputStream ( postdata . toString ( )
. getBytes ( ) ) ;
byte [ ] result = client . POSTbytes ( url , in , postdata . length ( ) ) ;
if ( result ! = null ) {
return new String ( result ) ;
}
} finally {
client . finish ( ) ;
}
return null ;
}
private static String loadInternal ( String path , RequestHeader requestHeader ) {
ByteArrayOutputStream buffer = new ByteArrayOutputStream ( ) ;
String realmProp = requestHeader . get ( RequestHeader . AUTHORIZATION ) ;
ServerSideIncludes . writeContent ( path , buffer , realmProp , "127.0.0.1" , requestHeader ) ; // TODO: ip
return buffer . toString ( ) ;
}
/ * *
* add DOCTYPE if necessary
*
* @return the web page with a leading DOCTYPE definition
* /
private static String processAddDoctype ( String data ) {
String result = data ;
BufferedReader reader = new BufferedReader ( new StringReader ( data ) ) ;
try {
String firstline = reader . readLine ( ) ;
if ( firstline ! = null ) {
if ( ! firstline . startsWith ( "<!DOCTYPE" ) ) {
result = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n"
+ data ;
}
}
} catch ( IOException e1 ) {
}
return result ;
}
/ * *
* load snippet from resource text file
*
* @return text from resource text file
* /
private static String loadPart ( String part ) {
String result = "" ;
try {
BufferedReader in = new BufferedReader ( new FileReader ( yacy . homedir + File . separatorChar + "htroot"
+ File . separatorChar + "interaction" + File . separatorChar
+ "parts" + File . separatorChar + part ) ) ;
String str ;
while ( ( str = in . readLine ( ) ) ! = null ) {
result + = str ;
}
in . close ( ) ;
} catch ( IOException e ) {
}
return result ;
}
public static StringBuffer process ( StringBuffer data , Charset charset , DigestURI url , RequestHeader requestHeader ) {
globalrequestHeader = requestHeader ;
Switchboard sb = Switchboard . getSwitchboard ( ) ;
boolean augmented = false ;
String Doc = data . toString ( ) ;
// Send document to REFLECT (http://www.reflect.ws/REST_API.html)
if ( sb . getConfigBool ( "augmentation.reflect" , false ) = = true ) {
try {
Doc = processExternal ( "http://reflect.ws/REST/GetHTML" ,
"document" , Doc ) ;
Log . logInfo ( "AUGMENTATION" , "reflected " + url ) ;
augmented = true ;
} catch ( Exception e ) {
}
}
// Add DOCTYPE if not present.
// This is required for IE to render position:absolute correctly.
if ( sb . getConfigBool ( "augmentation.addDoctype" , true ) = = true ) {
Doc = processAddDoctype ( Doc ) ;
augmented = true ;
}
if ( sb . getConfigBool ( "augmentation.reparse" , true ) = = true ) {
NodeList list = new NodeList ( ) ;
// Fill NodeList with parsed Document
try {
org . htmlparser . Parser par = new org . htmlparser . Parser ( ) ;
par . setInputHTML ( Doc ) ;
list = par . parse ( null ) ;
Log . logInfo ( "AUGMENTATION" , url . toString ( ) ) ;
} catch ( Exception e ) {
}
// Add Unique ID to every node element which has no id yet.
// This allows consistent interaction between client (browser) and
// back-end (data store) by providing "position awareness" in the
// document.
if ( sb . getConfigBool ( "augmentation.reparse.adduniqueid" , true ) = = true ) {
try {
NodeVisitor visitorAddUniqueID = new AugmentHtmlStream . VisitorAddUniqueID ( ) ;
list . visitAllNodesWith ( visitorAddUniqueID ) ;
} catch ( Exception e ) {
}
}
// Inspect on text tags
try {
NodeVisitor visitorText = new AugmentHtmlStream . VisitorText ( ) ;
list . visitAllNodesWith ( visitorText ) ;
} catch ( Exception e ) {
}
String SCI_GUID = "" ;
String SCI_GUID_DOI = "" ;
String SCI_GUID_PMID = "" ;
String SCI_TITLE = "" ;
String SCI_CREATOR = "" ;
String SCI_DESCRIPTION = "" ;
String SCI_IDENTIFIER = "" ;
String SCI_WHITELIST = "" ;
String SCI_URL = "" ;
String SCI_HASH = "" ;
SCI_URL = url . toString ( ) ;
// System.out.println("Starting augmentation for " + url);
// System.out.println("Content: " + Doc);
if ( ! ( list = = null ) ) {
// DOCUMENT IS MANIPULABLE BY HTML REWRITER
// SO SEND IT TO YACY PARSER
Document document = null ;
try {
final StringReader stringReader = new StringReader ( Doc ) ;
InputStream inputStream = new InputStream ( ) {
@Override
public int read ( ) throws IOException {
return stringReader . read ( ) ;
}
} ;
document = Document . mergeDocuments (
url ,
"text/html" ,
TextParser . parseSource ( url , "text/html" , null , data . length ( ) , inputStream ) ) ;
} catch ( Exception e ) {
}
if ( document ! = null ) {
if ( document . dc_format ( ) = = "text/html" ) {
SCI_TITLE = document . dc_title ( ) ;
SCI_CREATOR = document . dc_creator ( ) ;
SCI_DESCRIPTION = document . dc_description ( ) ;
SCI_IDENTIFIER = document . dc_identifier ( ) ;
}
}
SCI_HASH = "" + url . hashCode ( ) ;
// ADD AUGMENTED HEADER INFORMATION
NodeList header = list . extractAllNodesThatMatch (
new org . htmlparser . filters . NodeClassFilter (
org . htmlparser . tags . HeadTag . class ) , true ) ;
org . htmlparser . util . SimpleNodeIterator iterHeader = header
. elements ( ) ;
while ( iterHeader . hasMoreNodes ( ) ) {
org . htmlparser . tags . HeadTag ht = ( ( org . htmlparser . tags . HeadTag ) iterHeader
. nextNode ( ) ) ;
NodeList headchildren = ht . getChildren ( ) ;
headchildren . add ( new org . htmlparser . nodes . TextNode ( loadInternal ( "interactionparts/interaction.html" , requestHeader ) ) ) ;
augmented = true ;
ht . setChildren ( headchildren ) ;
}
// ADD AUGMENTED BODY INFORMATION
NodeList body = list . extractAllNodesThatMatch (
new org . htmlparser . filters . NodeClassFilter (
org . htmlparser . tags . BodyTag . class ) , true ) ;
org . htmlparser . util . SimpleNodeIterator iterBody = body
. elements ( ) ;
while ( iterBody . hasMoreNodes ( ) ) {
org . htmlparser . tags . BodyTag bt = ( ( org . htmlparser . tags . BodyTag ) iterBody
. nextNode ( ) ) ;
NodeList bodychildren = bt . getChildren ( ) ;
// ADD AUGMENTED INFO
org . htmlparser . tags . Div sci_aug = new org . htmlparser . tags . Div ( ) ;
sci_aug . setTagName ( "div" ) ;
sci_aug . setAttribute ( "id" , "sciety_augmented" ) ;
sci_aug . setAttribute ( "style" ,
"visibility: hidden; position: absolute; overflow: hidden;" ) ;
org . htmlparser . util . NodeList childr = new org . htmlparser . util . NodeList ( ) ;
sci_aug . setChildren ( childr ) ;
org . htmlparser . tags . Div sci_aug_endtag = new org . htmlparser . tags . Div ( ) ;
sci_aug_endtag . setTagName ( "/div" ) ;
sci_aug . setEndTag ( sci_aug_endtag ) ;
bodychildren . add ( sci_aug ) ;
bt . setChildren ( bodychildren ) ;
augmented = true ;
}
Doc = list . toHtml ( true ) ;
augmented = true ;
} // not list = null
} // reparse
if ( augmented ) {
return ( new StringBuffer ( Doc ) ) ;
} else {
return ( data ) ;