augmented browsing: replace htmlparser with jsoup, which is more stable and reliable
pull/1/head
cominch 12 years ago
parent ad62609ec7
commit e2119f4e76

@ -69,5 +69,6 @@
<classpathentry kind="lib" path="lib/arq-2.8.7.jar"/>
<classpathentry kind="lib" path="lib/iri-0.8.jar"/>
<classpathentry kind="lib" path="lib/apache-solr-core-3.6.0.jar" sourcepath="/Volumes/Raptor/Data/workspace/apache-solr-3.6.0/src/java"/>
<classpathentry kind="lib" path="lib/jsoup-1.6.3.jar"/>
<classpathentry kind="output" path="gen"/>
</classpath>

@ -191,6 +191,7 @@
<pathelement location="${lib}/jetty-util-6.1.26-patched-JETTY-1340.jar" />
<pathelement location="${lib}/jsch-0.1.42.jar" />
<pathelement location="${lib}/json-simple-1.1.jar" />
<pathelement location="${lib}/jsoup-1.6.3.jar" />
<pathelement location="${lib}/log4j-1.2.16.jar" />
<pathelement location="${lib}/log4j-over-slf4j-1.6.1.jar" />
<pathelement location="${lib}/lucene-analyzers-3.6.0.jar" />

@ -508,6 +508,7 @@ proxyURL=false
proxyURL.access=127.0.0.1,0:0:0:0:0:0:0:1
# which urls to rewrite to /proxy.html?url=x (values: all, domainlist)
proxyURL.rewriteURLs=domainlist
proxyURL.useforresults=false
# From the 'IndexCreate' menu point you can also define a crawling start point.
# The crawling works the same way as the prefetch, but it is possible to

Binary file not shown.

@ -21,10 +21,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.NodeVisitor;
import org.jsoup.Jsoup;
import de.anomic.http.server.ServerSideIncludes;
@ -33,101 +30,6 @@ public class AugmentHtmlStream {
static RequestHeader globalrequestHeader;
/**
 * NodeVisitor that makes sure every visited tag carries an "id" attribute.
 * A tag without one receives a generated value made of the prefix "sci"
 * followed by a running number ({@code sci1}, {@code sci2}, ...). Link tags
 * additionally get their href attribute written to the log under the
 * "AUGMENTATION" label. Text nodes are ignored.
 */
private static class VisitorAddUniqueID extends NodeVisitor {

    /** Running number used for the most recently generated id. */
    private int counter;

    public VisitorAddUniqueID() {
        this.setCounter(0);
    }

    public int getCounter() {
        return this.counter;
    }

    public void setCounter(int counter) {
        this.counter = counter;
    }

    @Override
    public void visitStringNode(Text string) {
        // nothing to do for plain text
    }

    @Override
    public void visitTag(Tag tag) {
        final boolean missingId = tag.getAttribute("id") == null;
        if (missingId) {
            // increment first, so the first generated id ends in 1
            this.setCounter(this.getCounter() + 1);
            // the value is handed over including its surrounding double quotes
            final String quotedId = "\"sci" + this.getCounter() + "\"";
            tag.setAttribute("id", quotedId);
        }

        if (!(tag instanceof org.htmlparser.tags.LinkTag)) {
            return;
        }
        // Link: report its href
        Log.logInfo("AUGMENTATION", tag.getAttribute("href"));
    }
}
/**
 * NodeVisitor intended for inspecting text-bearing elements. Both visit
 * callbacks are currently empty; the class only carries a counter that
 * starts at 0 and is exposed through a getter and a setter.
 */
private static class VisitorText extends NodeVisitor {

    private int counter;

    public VisitorText() {
        counter = 0;
    }

    public int getCounter() {
        return counter;
    }

    public void setCounter(int counter) {
        this.counter = counter;
    }

    @Override
    public void visitStringNode(Text string) {
        // Intentionally empty.
        // Disabled experiment: for strings with a non-null parent, replace
        // every "und" in the text with
        //   <a href="http://www.kit.edu/" target="_blank">KIT</a>
        // via string.setText(string.getText().replaceAll(...)).
    }

    @Override
    public void visitTag(Tag tag) {
        // Intentionally empty.
        // Disabled experiments:
        //  - tag.setText(tag.getText()+" <span>augmented</span>");
        //  - append a TextNode built from
        //    loadInternal("interactionparts/scibutton.html", globalrequestHeader)
        //    to tag.getChildren() and write the list back with setChildren.
    }
}
/**
* send web page to external REFLECT web service
*
@ -244,158 +146,34 @@ public class AugmentHtmlStream {
// Add DOCTYPE if not present.
// This is required for IE to render position:absolute correctly.
if (sb.getConfigBool("augmentation.addDoctype", true) == true) {
if (sb.getConfigBool("augmentation.addDoctype", false) == true) {
Doc = processAddDoctype(Doc);
augmented = true;
}
if (sb.getConfigBool("augmentation.reparse", true) == true) {
NodeList list = new NodeList();
// Fill NodeList with parsed Document
try {
org.htmlparser.Parser par = new org.htmlparser.Parser();
par.setInputHTML(Doc);
list = par.parse(null);
Log.logInfo ("AUGMENTATION", url.toString());
} catch (Exception e) {
}
// Add Unique ID to every node element which has no id yet.
// This allows consistent interaction between client (browser) and
// back-end (data store) by providing "position awareness" in the
// document.
if (sb.getConfigBool("augmentation.reparse.adduniqueid", true) == true) {
try {
NodeVisitor visitorAddUniqueID = new AugmentHtmlStream.VisitorAddUniqueID();
list.visitAllNodesWith(visitorAddUniqueID);
} catch (Exception e) {
}
}
// Inspect on text tags
try {
NodeVisitor visitorText = new AugmentHtmlStream.VisitorText();
list.visitAllNodesWith(visitorText);
} catch (Exception e) {
}
// System.out.println("Starting augmentation for " + url);
// System.out.println("Content: " + Doc);
if (!(list == null)) {
// DOCUMENT IS MANIPULABLE BY HTML REWRITER
// SO SEND IT TO YACY PARSER
Document document = null;
try {
final StringReader stringReader = new StringReader(Doc);
InputStream inputStream = new InputStream() {
@Override
public int read() throws IOException {
return stringReader.read();
}
};
document = Document.mergeDocuments(
url,
"text/html",
TextParser.parseSource(url, "text/html", null,
data.length(), inputStream));
} catch (Exception e) {
}
if (document != null) {
if (document.dc_format() == "text/html") {
// SCI_TITLE = document.dc_title();
// SCI_CREATOR = document.dc_creator();
// SCI_DESCRIPTION = document.dc_description();
// SCI_IDENTIFIER = document.dc_identifier();
}
}
// ADD AUGMENTED HEADER INFORMATION
NodeList header = list.extractAllNodesThatMatch(
new org.htmlparser.filters.NodeClassFilter(
org.htmlparser.tags.HeadTag.class), true);
org.htmlparser.util.SimpleNodeIterator iterHeader = header
.elements();
while (iterHeader.hasMoreNodes()) {
org.htmlparser.tags.HeadTag ht = ((org.htmlparser.tags.HeadTag) iterHeader
.nextNode());
NodeList headchildren = ht.getChildren();
headchildren.add(new org.htmlparser.nodes.TextNode(loadInternal("env/templates/jqueryheader.template", requestHeader)));
headchildren.add(new org.htmlparser.nodes.TextNode("<script type='text/javascript'>"+loadInternal("interaction_elements/interaction.js", requestHeader)+"</script>"));
headchildren.add(new org.htmlparser.nodes.TextNode("<script type='text/javascript'>"+loadInternal("interaction_elements/interaction_metadata.js", requestHeader)+"</script>"));
augmented = true;
ht.setChildren(headchildren);
}
// ADD AUGMENTED BODY INFORMATION
NodeList body = list.extractAllNodesThatMatch(
new org.htmlparser.filters.NodeClassFilter(
org.htmlparser.tags.BodyTag.class), true);
org.htmlparser.util.SimpleNodeIterator iterBody = body
.elements();
while (iterBody.hasMoreNodes()) {
org.htmlparser.tags.BodyTag bt = ((org.htmlparser.tags.BodyTag) iterBody
.nextNode());
NodeList bodychildren = bt.getChildren();
bodychildren.add(new org.htmlparser.nodes.TextNode(loadInternal("interaction_elements/OverlayInteraction.html?action="+action+"&urlhash="+ ASCII.String(url.hash()) +"&url="+url.toNormalform(false, true), requestHeader)));
bodychildren.add(new org.htmlparser.nodes.TextNode(loadInternal("interaction_elements/Footer.html?action="+action+"&urlhash="+ ASCII.String(url.hash()) +"&url="+url.toNormalform(false, true), requestHeader)));
bt.setChildren(bodychildren);
augmented = true;
}
Doc = list.toHtml(true);
augmented = true;
} // not list = null
} // reparse
if (sb.getConfigBool("augmentation.reparse", false) == true) {
org.jsoup.nodes.Document d = Jsoup.parse(Doc);
d.title ("yacy - "+d.title());
if (sb.getConfigBool("interaction.overlayinteraction.enabled", false) == true) {
d.head().append (loadInternal("env/templates/jqueryheader.template", requestHeader));
d.head().append ("<script type='text/javascript'>"+loadInternal("interaction_elements/interaction.js", requestHeader)+"</script>");
d.head().append ("<script type='text/javascript'>"+loadInternal("interaction_elements/interaction_metadata.js", requestHeader)+"</script>");
d.body().append (loadInternal("interaction_elements/OverlayInteraction.html?action="+action+"&urlhash="+ ASCII.String(url.hash()) +"&url="+url.toNormalform(false, true), requestHeader));
d.body().append (loadInternal("interaction_elements/Footer.html?action="+action+"&urlhash="+ ASCII.String(url.hash()) +"&url="+url.toNormalform(false, true), requestHeader));
}
Doc = d.html();
augmented = true;
}
if (augmented) {

Loading…
Cancel
Save