|
|
|
@ -21,10 +21,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
|
|
|
|
|
import net.yacy.kelondro.logging.Log;
|
|
|
|
|
import net.yacy.search.Switchboard;
|
|
|
|
|
|
|
|
|
|
import org.htmlparser.Tag;
|
|
|
|
|
import org.htmlparser.Text;
|
|
|
|
|
import org.htmlparser.util.NodeList;
|
|
|
|
|
import org.htmlparser.visitors.NodeVisitor;
|
|
|
|
|
import org.jsoup.Jsoup;
|
|
|
|
|
|
|
|
|
|
import de.anomic.http.server.ServerSideIncludes;
|
|
|
|
|
|
|
|
|
@ -33,101 +30,6 @@ public class AugmentHtmlStream {
|
|
|
|
|
|
|
|
|
|
static RequestHeader globalrequestHeader;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* creates a NodeVisitor which assigns a unique ID to every node
|
|
|
|
|
*
|
|
|
|
|
* @return customized NodeVisitor
|
|
|
|
|
*/
|
|
|
|
|
private static class VisitorAddUniqueID extends NodeVisitor {
|
|
|
|
|
|
|
|
|
|
private int counter;
|
|
|
|
|
|
|
|
|
|
public VisitorAddUniqueID() {
|
|
|
|
|
this.setCounter(0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void visitTag(Tag tag) {
|
|
|
|
|
if (tag.getAttribute("id") == null) {
|
|
|
|
|
this.setCounter(this.getCounter() + 1);
|
|
|
|
|
tag.setAttribute("id", "\"sci" + this.getCounter() + "\"");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (tag instanceof org.htmlparser.tags.LinkTag) {
|
|
|
|
|
// Link
|
|
|
|
|
Log.logInfo("AUGMENTATION", tag.getAttribute("href"));
|
|
|
|
|
|
|
|
|
|
//LinkTag lt = (LinkTag)tag;
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void visitStringNode(Text string) {
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setCounter(int counter) {
|
|
|
|
|
this.counter = counter;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public int getCounter() {
|
|
|
|
|
return this.counter;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* creates a NodeVisitor which inspects the element if it contains useful
|
|
|
|
|
* text
|
|
|
|
|
*
|
|
|
|
|
* @return customized NodeVisitor
|
|
|
|
|
*/
|
|
|
|
|
private static class VisitorText extends NodeVisitor {
|
|
|
|
|
|
|
|
|
|
private int counter;
|
|
|
|
|
|
|
|
|
|
public VisitorText() {
|
|
|
|
|
this.counter = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void visitTag(Tag tag) {
|
|
|
|
|
|
|
|
|
|
// tag.setText(tag.getText()+" <span>augmented</span>");
|
|
|
|
|
|
|
|
|
|
// Node node = new org.htmlparser.nodes.TextNode(loadInternal("interactionparts/scibutton.html", globalrequestHeader));
|
|
|
|
|
// NodeList nl = tag.getChildren();
|
|
|
|
|
// nl.add (node);
|
|
|
|
|
// tag.setChildren(nl);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void visitStringNode(Text string) {
|
|
|
|
|
|
|
|
|
|
// if (string.getParent() != null) {
|
|
|
|
|
//
|
|
|
|
|
// string.setText(string
|
|
|
|
|
// .getText()
|
|
|
|
|
// .replaceAll("und",
|
|
|
|
|
// "<a href=\"http://www.kit.edu/\" target=\"_blank\">KIT</a>"));
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
// }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setCounter(int counter) {
|
|
|
|
|
this.counter = counter;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public int getCounter() {
|
|
|
|
|
return this.counter;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* send web page to external REFLECT web service
|
|
|
|
|
*
|
|
|
|
@ -244,158 +146,34 @@ public class AugmentHtmlStream {
|
|
|
|
|
// Add DOCTYPE if not present.
|
|
|
|
|
// This is required for IE to render position:absolute correctly.
|
|
|
|
|
|
|
|
|
|
if (sb.getConfigBool("augmentation.addDoctype", true) == true) {
|
|
|
|
|
if (sb.getConfigBool("augmentation.addDoctype", false) == true) {
|
|
|
|
|
Doc = processAddDoctype(Doc);
|
|
|
|
|
augmented = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (sb.getConfigBool("augmentation.reparse", true) == true) {
|
|
|
|
|
|
|
|
|
|
NodeList list = new NodeList();
|
|
|
|
|
|
|
|
|
|
// Fill NodeList with parsed Document
|
|
|
|
|
try {
|
|
|
|
|
|
|
|
|
|
org.htmlparser.Parser par = new org.htmlparser.Parser();
|
|
|
|
|
|
|
|
|
|
par.setInputHTML(Doc);
|
|
|
|
|
|
|
|
|
|
list = par.parse(null);
|
|
|
|
|
|
|
|
|
|
Log.logInfo ("AUGMENTATION", url.toString());
|
|
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Add Unique ID to every node element which has no id yet.
|
|
|
|
|
// This allows consistent interaction between client (browser) and
|
|
|
|
|
// back-end (data store) by providing "position awareness" in the
|
|
|
|
|
// document.
|
|
|
|
|
if (sb.getConfigBool("augmentation.reparse.adduniqueid", true) == true) {
|
|
|
|
|
try {
|
|
|
|
|
|
|
|
|
|
NodeVisitor visitorAddUniqueID = new AugmentHtmlStream.VisitorAddUniqueID();
|
|
|
|
|
list.visitAllNodesWith(visitorAddUniqueID);
|
|
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Inspect on text tags
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
|
|
|
|
|
NodeVisitor visitorText = new AugmentHtmlStream.VisitorText();
|
|
|
|
|
list.visitAllNodesWith(visitorText);
|
|
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// System.out.println("Starting augmentation for " + url);
|
|
|
|
|
// System.out.println("Content: " + Doc);
|
|
|
|
|
|
|
|
|
|
if (!(list == null)) {
|
|
|
|
|
|
|
|
|
|
// DOCUMENT IS MANIPULABLE BY HTML REWRITER
|
|
|
|
|
|
|
|
|
|
// SO SEND IT TO YACY PARSER
|
|
|
|
|
|
|
|
|
|
Document document = null;
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
final StringReader stringReader = new StringReader(Doc);
|
|
|
|
|
InputStream inputStream = new InputStream() {
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public int read() throws IOException {
|
|
|
|
|
return stringReader.read();
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
document = Document.mergeDocuments(
|
|
|
|
|
url,
|
|
|
|
|
"text/html",
|
|
|
|
|
TextParser.parseSource(url, "text/html", null,
|
|
|
|
|
data.length(), inputStream));
|
|
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (document != null) {
|
|
|
|
|
|
|
|
|
|
if (document.dc_format() == "text/html") {
|
|
|
|
|
|
|
|
|
|
// SCI_TITLE = document.dc_title();
|
|
|
|
|
// SCI_CREATOR = document.dc_creator();
|
|
|
|
|
// SCI_DESCRIPTION = document.dc_description();
|
|
|
|
|
// SCI_IDENTIFIER = document.dc_identifier();
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// ADD AUGMENTED HEADER INFORMATION
|
|
|
|
|
|
|
|
|
|
NodeList header = list.extractAllNodesThatMatch(
|
|
|
|
|
new org.htmlparser.filters.NodeClassFilter(
|
|
|
|
|
org.htmlparser.tags.HeadTag.class), true);
|
|
|
|
|
|
|
|
|
|
org.htmlparser.util.SimpleNodeIterator iterHeader = header
|
|
|
|
|
.elements();
|
|
|
|
|
|
|
|
|
|
while (iterHeader.hasMoreNodes()) {
|
|
|
|
|
org.htmlparser.tags.HeadTag ht = ((org.htmlparser.tags.HeadTag) iterHeader
|
|
|
|
|
.nextNode());
|
|
|
|
|
|
|
|
|
|
NodeList headchildren = ht.getChildren();
|
|
|
|
|
|
|
|
|
|
headchildren.add(new org.htmlparser.nodes.TextNode(loadInternal("env/templates/jqueryheader.template", requestHeader)));
|
|
|
|
|
headchildren.add(new org.htmlparser.nodes.TextNode("<script type='text/javascript'>"+loadInternal("interaction_elements/interaction.js", requestHeader)+"</script>"));
|
|
|
|
|
headchildren.add(new org.htmlparser.nodes.TextNode("<script type='text/javascript'>"+loadInternal("interaction_elements/interaction_metadata.js", requestHeader)+"</script>"));
|
|
|
|
|
|
|
|
|
|
augmented = true;
|
|
|
|
|
|
|
|
|
|
ht.setChildren(headchildren);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ADD AUGMENTED BODY INFORMATION
|
|
|
|
|
|
|
|
|
|
NodeList body = list.extractAllNodesThatMatch(
|
|
|
|
|
new org.htmlparser.filters.NodeClassFilter(
|
|
|
|
|
org.htmlparser.tags.BodyTag.class), true);
|
|
|
|
|
|
|
|
|
|
org.htmlparser.util.SimpleNodeIterator iterBody = body
|
|
|
|
|
.elements();
|
|
|
|
|
|
|
|
|
|
while (iterBody.hasMoreNodes()) {
|
|
|
|
|
|
|
|
|
|
org.htmlparser.tags.BodyTag bt = ((org.htmlparser.tags.BodyTag) iterBody
|
|
|
|
|
.nextNode());
|
|
|
|
|
|
|
|
|
|
NodeList bodychildren = bt.getChildren();
|
|
|
|
|
|
|
|
|
|
bodychildren.add(new org.htmlparser.nodes.TextNode(loadInternal("interaction_elements/OverlayInteraction.html?action="+action+"&urlhash="+ ASCII.String(url.hash()) +"&url="+url.toNormalform(false, true), requestHeader)));
|
|
|
|
|
|
|
|
|
|
bodychildren.add(new org.htmlparser.nodes.TextNode(loadInternal("interaction_elements/Footer.html?action="+action+"&urlhash="+ ASCII.String(url.hash()) +"&url="+url.toNormalform(false, true), requestHeader)));
|
|
|
|
|
|
|
|
|
|
bt.setChildren(bodychildren);
|
|
|
|
|
|
|
|
|
|
augmented = true;
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Doc = list.toHtml(true);
|
|
|
|
|
|
|
|
|
|
augmented = true;
|
|
|
|
|
|
|
|
|
|
} // not list = null
|
|
|
|
|
|
|
|
|
|
} // reparse
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (sb.getConfigBool("augmentation.reparse", false) == true) {
|
|
|
|
|
|
|
|
|
|
org.jsoup.nodes.Document d = Jsoup.parse(Doc);
|
|
|
|
|
|
|
|
|
|
d.title ("yacy - "+d.title());
|
|
|
|
|
|
|
|
|
|
if (sb.getConfigBool("interaction.overlayinteraction.enabled", false) == true) {
|
|
|
|
|
|
|
|
|
|
d.head().append (loadInternal("env/templates/jqueryheader.template", requestHeader));
|
|
|
|
|
d.head().append ("<script type='text/javascript'>"+loadInternal("interaction_elements/interaction.js", requestHeader)+"</script>");
|
|
|
|
|
d.head().append ("<script type='text/javascript'>"+loadInternal("interaction_elements/interaction_metadata.js", requestHeader)+"</script>");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
d.body().append (loadInternal("interaction_elements/OverlayInteraction.html?action="+action+"&urlhash="+ ASCII.String(url.hash()) +"&url="+url.toNormalform(false, true), requestHeader));
|
|
|
|
|
d.body().append (loadInternal("interaction_elements/Footer.html?action="+action+"&urlhash="+ ASCII.String(url.hash()) +"&url="+url.toNormalform(false, true), requestHeader));
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Doc = d.html();
|
|
|
|
|
|
|
|
|
|
augmented = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (augmented) {
|
|
|
|
|