package net.yacy.interaction; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.StringReader; import java.net.URLEncoder; import java.nio.charset.Charset; import net.yacy.yacy; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.document.Document; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import org.htmlparser.Tag; import org.htmlparser.Text; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeList; import org.htmlparser.visitors.NodeVisitor; import de.anomic.http.server.ServerSideIncludes; public class AugmentHtmlStream { static RequestHeader globalrequestHeader; /** * creates a NodeVisitor which assigns a unique ID to every node * * @return customized NodeVisitor */ private static class VisitorAddUniqueID extends NodeVisitor { private int counter; public VisitorAddUniqueID() { this.setCounter(0); } @Override public void visitTag(Tag tag) { if (tag.getAttribute("id") == null) { this.setCounter(this.getCounter() + 1); tag.setAttribute("id", "\"sci" + this.getCounter() + "\""); } if (tag instanceof org.htmlparser.tags.LinkTag) { // Link Log.logInfo("AUGMENTATION", tag.getAttribute("href")); LinkTag lt = (LinkTag)tag; } } @Override public void visitStringNode(Text string) { } public void setCounter(int counter) { this.counter = counter; } public int getCounter() { return this.counter; } } /** * creates a NodeVisitor which inspects the element if it contains useful * text * * @return customized NodeVisitor */ private static class VisitorText extends NodeVisitor { private int counter; public VisitorText() { this.setCounter(0); } @Override public void visitTag(Tag tag) { // tag.setText(tag.getText()+" augmented"); // Node node = new org.htmlparser.nodes.TextNode(loadInternal("interactionparts/scibutton.html", globalrequestHeader)); // NodeList nl = tag.getChildren(); // nl.add (node); // tag.setChildren(nl); } @Override public void visitStringNode(Text string) { // if (string.getParent() != null) { // // string.setText(string // .getText() // .replaceAll("und", // "KIT")); // // // } } public void setCounter(int counter) { this.counter = counter; } public int getCounter() { return this.counter; } } /** * send web page to external REFLECT web service * * @return the web page with integrated REFLECT elements */ private static String processExternal(String url, String fieldname, String data) throws IOException { final HTTPClient client = new HTTPClient(); try { StringBuilder postdata = new StringBuilder(); postdata.append("document="); postdata.append(URLEncoder.encode(data, "UTF-8")); InputStream in = new ByteArrayInputStream(postdata.toString() .getBytes()); byte[] result = client.POSTbytes(url, in, postdata.length()); if (result != null) { return new String(result); } } finally { client.finish(); } return null; } private static String loadInternal(String path, RequestHeader requestHeader) { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); String realmProp = requestHeader.get(RequestHeader.AUTHORIZATION); ServerSideIncludes.writeContent(path, buffer, realmProp, "127.0.0.1", requestHeader); // TODO: ip return buffer.toString(); } /** * add DOCTYPE if necessary * * @return the web page with a leading DOCTYPE definition */ private static String processAddDoctype(String data) { String result = data; BufferedReader reader = new BufferedReader(new StringReader(data)); try { String firstline = reader.readLine(); if (firstline != null) { if (!firstline.startsWith("\n" + data; } } } catch (IOException e1) { } return result; } /** * load snippet from resource text file * * @return text from resource text file */ private static String loadPart(String part) { String result = ""; try { BufferedReader in = new BufferedReader(new FileReader(yacy.homedir + File.separatorChar + "htroot" + File.separatorChar + "interaction" + File.separatorChar + "parts" + File.separatorChar + part)); String str; while ((str = in.readLine()) != null) { result += str; } in.close(); } catch (IOException e) { } return result; } public static StringBuffer process (StringBuffer data, Charset charset, DigestURI url, RequestHeader requestHeader) { globalrequestHeader = requestHeader; Switchboard sb = Switchboard.getSwitchboard(); boolean augmented = false; String Doc = data.toString(); // Send document to REFLECT (http://www.reflect.ws/REST_API.html) if (sb.getConfigBool("augmentation.reflect", false) == true) { try { Doc = processExternal("http://reflect.ws/REST/GetHTML", "document", Doc); Log.logInfo("AUGMENTATION", "reflected " + url); augmented = true; } catch (Exception e) { } } // Add DOCTYPE if not present. // This is required for IE to render position:absolute correctly. if (sb.getConfigBool("augmentation.addDoctype", true) == true) { Doc = processAddDoctype(Doc); augmented = true; } if (sb.getConfigBool("augmentation.reparse", true) == true) { NodeList list = new NodeList(); // Fill NodeList with parsed Document try { org.htmlparser.Parser par = new org.htmlparser.Parser(); par.setInputHTML(Doc); list = par.parse(null); Log.logInfo ("AUGMENTATION", url.toString()); } catch (Exception e) { } // Add Unique ID to every node element which has no id yet. // This allows consistent interaction between client (browser) and // back-end (data store) by providing "position awareness" in the // document. if (sb.getConfigBool("augmentation.reparse.adduniqueid", true) == true) { try { NodeVisitor visitorAddUniqueID = new AugmentHtmlStream.VisitorAddUniqueID(); list.visitAllNodesWith(visitorAddUniqueID); } catch (Exception e) { } } // Inspect on text tags try { NodeVisitor visitorText = new AugmentHtmlStream.VisitorText(); list.visitAllNodesWith(visitorText); } catch (Exception e) { } String SCI_GUID = ""; String SCI_GUID_DOI = ""; String SCI_GUID_PMID = ""; String SCI_TITLE = ""; String SCI_CREATOR = ""; String SCI_DESCRIPTION = ""; String SCI_IDENTIFIER = ""; String SCI_WHITELIST = ""; String SCI_URL = ""; String SCI_HASH = ""; SCI_URL = url.toString(); // System.out.println("Starting augmentation for " + url); // System.out.println("Content: " + Doc); if (!(list == null)) { // DOCUMENT IS MANIPULABLE BY HTML REWRITER // SO SEND IT TO YACY PARSER Document document = null; try { final StringReader stringReader = new StringReader(Doc); InputStream inputStream = new InputStream() { @Override public int read() throws IOException { return stringReader.read(); } }; document = Document.mergeDocuments( url, "text/html", TextParser.parseSource(url, "text/html", null, data.length(), inputStream)); } catch (Exception e) { } if (document != null) { if (document.dc_format() == "text/html") { SCI_TITLE = document.dc_title(); SCI_CREATOR = document.dc_creator(); SCI_DESCRIPTION = document.dc_description(); SCI_IDENTIFIER = document.dc_identifier(); } } SCI_HASH = "" + url.hashCode(); // ADD AUGMENTED HEADER INFORMATION NodeList header = list.extractAllNodesThatMatch( new org.htmlparser.filters.NodeClassFilter( org.htmlparser.tags.HeadTag.class), true); org.htmlparser.util.SimpleNodeIterator iterHeader = header .elements(); while (iterHeader.hasMoreNodes()) { org.htmlparser.tags.HeadTag ht = ((org.htmlparser.tags.HeadTag) iterHeader .nextNode()); NodeList headchildren = ht.getChildren(); headchildren.add(new org.htmlparser.nodes.TextNode(loadInternal("interactionparts/interaction.html", requestHeader))); augmented = true; ht.setChildren(headchildren); } // ADD AUGMENTED BODY INFORMATION NodeList body = list.extractAllNodesThatMatch( new org.htmlparser.filters.NodeClassFilter( org.htmlparser.tags.BodyTag.class), true); org.htmlparser.util.SimpleNodeIterator iterBody = body .elements(); while (iterBody.hasMoreNodes()) { org.htmlparser.tags.BodyTag bt = ((org.htmlparser.tags.BodyTag) iterBody .nextNode()); NodeList bodychildren = bt.getChildren(); // ADD AUGMENTED INFO org.htmlparser.tags.Div sci_aug = new org.htmlparser.tags.Div(); sci_aug.setTagName("div"); sci_aug.setAttribute("id", "sciety_augmented"); sci_aug.setAttribute("style", "visibility: hidden; position: absolute; overflow: hidden;"); org.htmlparser.util.NodeList childr = new org.htmlparser.util.NodeList(); sci_aug.setChildren(childr); org.htmlparser.tags.Div sci_aug_endtag = new org.htmlparser.tags.Div(); sci_aug_endtag.setTagName("/div"); sci_aug.setEndTag(sci_aug_endtag); bodychildren.add(sci_aug); bt.setChildren(bodychildren); augmented = true; } Doc = list.toHtml(true); augmented = true; } // not list = null } // reparse if (augmented) { return (new StringBuffer (Doc)); } else { return (data); } } }