augmentedParser: add features and integrate an external HTML parser to

modify existing web pages

Conflicts:
	addon/YaCy.app/Contents/Info.plist
	build.xml
pull/1/head
cominch 13 years ago committed by Michael Peter Christen
parent 9cbfc1a1c0
commit b21048892b

@ -49,5 +49,7 @@
<classpathentry kind="lib" path="lib/commons-compress-1.4.1.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="lib" path="lib/icu4j-core.jar"/>
<classpathentry kind="lib" path="lib/htmllexer.jar"/>
<classpathentry kind="lib" path="lib/htmlparser.jar"/>
<classpathentry kind="output" path="gen"/>
</classpath>

@ -50,6 +50,8 @@
<string>$JAVAROOT/lib/commons-logging-1.1.1.jar</string>
<string>$JAVAROOT/lib/fontbox-1.6.0.jar</string>
<string>$JAVAROOT/lib/geronimo-stax-api_1.0_spec-1.0.1.jar</string>
<string>$JAVAROOT/lib/htmllexer.jar</string>
<string>$JAVAROOT/lib/htmlparser.jar</string>
<string>$JAVAROOT/lib/httpclient-4.2.jar</string>
<string>$JAVAROOT/lib/httpcore-4.2.jar</string>
<string>$JAVAROOT/lib/httpmime-4.2.jar</string>

@ -168,6 +168,8 @@
<pathelement location="${lib}/commons-logging-1.1.1.jar" />
<pathelement location="${lib}/fontbox-1.6.0.jar" />
<pathelement location="${lib}/geronimo-stax-api_1.0_spec-1.0.1.jar" />
<pathelement location="${lib}/htmllexer.jar" />
<pathelement location="${lib}/htmlparser.jar" />
<pathelement location="${lib}/httpclient-4.2.jar" />
<pathelement location="${lib}/httpcore-4.2.jar" />
<pathelement location="${lib}/httpmime-4.2.jar" />

Binary file not shown.

Binary file not shown.

@ -74,7 +74,7 @@ public class ServerSideIncludes {
}
}
private static void writeContent(String path, final OutputStream out, final String authorization, final String requesthost, final RequestHeader requestHeader) {
public static void writeContent(String path, final OutputStream out, final String authorization, final String requesthost, final RequestHeader requestHeader) {
// check if there are arguments in path string
String args = "";
final int argpos = path.indexOf('?');

@ -1,21 +1,430 @@
package net.yacy.interaction;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import net.yacy.yacy;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.document.Document;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.NodeVisitor;
import de.anomic.http.server.ServerSideIncludes;
public class AugmentHtmlStream {
static RequestHeader globalrequestHeader;
/**
 * NodeVisitor which assigns a generated "id" attribute ("sci1", "sci2", ...)
 * to every visited tag that does not carry an "id" attribute yet. The number
 * is taken from a counter that is incremented for each id handed out.
 * The "href" attribute of visited link tags is written to the log.
 */
private static class VisitorAddUniqueID extends NodeVisitor {

    /** Number of ids generated so far; also the numeric suffix of the last generated id. */
    private int counter;

    public VisitorAddUniqueID() {
        this.setCounter(0);
    }

    /**
     * Adds a generated id to the tag if it has none and logs the target of link tags.
     *
     * @param tag the tag currently visited
     */
    @Override
    public void visitTag(Tag tag) {
        if (tag.getAttribute("id") == null) {
            this.setCounter(this.getCounter() + 1);
            // The double quotes are part of the value handed to the parser.
            // NOTE(review): presumably needed so the attribute is serialized as id="sciN" - confirm against htmlparser.
            tag.setAttribute("id", "\"sci" + this.getCounter() + "\"");
        }
        if (tag instanceof LinkTag) {
            // Anchors without an href (e.g. named anchors) yield null; do not log a null message.
            final String href = tag.getAttribute("href");
            if (href != null) {
                Log.logInfo("AUGMENTATION", href);
            }
        }
    }

    /** Text nodes are not modified by this visitor. */
    @Override
    public void visitStringNode(Text string) {
    }

    public void setCounter(int counter) {
        this.counter = counter;
    }

    public int getCounter() {
        return this.counter;
    }
}
/**
 * NodeVisitor meant for inspecting elements that contain useful text.
 * At present both visit callbacks do nothing; the experiments that were
 * tried here are kept as comments inside the callbacks.
 */
private static class VisitorText extends NodeVisitor {

    /** Counter value; only touched through the accessor pair below. */
    private int counter;

    public VisitorText() {
        this.setCounter(0);
    }

    @Override
    public void visitTag(Tag tag) {
        // No-op. Disabled experiments:
        //   (1) mark the tag as augmented
        //       tag.setText(tag.getText()+" <span>augmented</span>");
        //   (2) append the snippet "interactionparts/scibutton.html" as a child node
        //       Node node = new org.htmlparser.nodes.TextNode(loadInternal("interactionparts/scibutton.html", globalrequestHeader));
        //       NodeList nl = tag.getChildren();
        //       nl.add (node);
        //       tag.setChildren(nl);
    }

    @Override
    public void visitStringNode(Text string) {
        // No-op. Disabled experiment: replace a word in the text by a link
        //   if (string.getParent() != null) {
        //       string.setText(string
        //               .getText()
        //               .replaceAll("und",
        //                       "<a href=\"http://www.kit.edu/\" target=\"_blank\">KIT</a>"));
        //   }
    }

    public void setCounter(int value) {
        this.counter = value;
    }

    public int getCounter() {
        return this.counter;
    }
}
/**
 * Sends a web page to an external REFLECT web service with an HTTP POST.
 * The page is transmitted as a single URL-encoded form field.
 *
 * @param url       address of the service to post to
 * @param fieldname name of the form field which carries the page;
 *                  "document" is used if this is null
 * @param data      the web page to send
 * @return the response body of the service as text, or null if the client
 *         delivered no response body
 * @throws IOException propagated from the encoding step or from the HTTP client
 */
private static String processExternal(String url, String fieldname,
        String data) throws IOException {
    final HTTPClient client = new HTTPClient();
    try {
        // Honour the requested field name instead of a hard-coded one.
        final String field = (fieldname == null) ? "document" : fieldname;
        final String postdata = URLEncoder.encode(field, "UTF-8") + '='
                + URLEncoder.encode(data, "UTF-8");

        // Explicit charset: the announced length must be the byte count of what is sent,
        // independent of the platform default encoding.
        final byte[] body = postdata.getBytes("UTF-8");
        final InputStream in = new ByteArrayInputStream(body);
        final byte[] result = client.POSTbytes(url, in, body.length);
        if (result != null) {
            // NOTE(review): assumes the service answers in UTF-8 (the request is UTF-8) - confirm.
            return new String(result, "UTF-8");
        }
    } finally {
        client.finish();
    }
    return null;
}
/**
 * Lets ServerSideIncludes.writeContent write the content for the given path
 * into an in-memory buffer and returns the buffer content as a String.
 *
 * @param path          path handed to ServerSideIncludes.writeContent
 * @param requestHeader header of the current request; its AUTHORIZATION
 *                      entry is passed on as authorization
 * @return the buffered output converted with ByteArrayOutputStream.toString()
 */
private static String loadInternal(String path, RequestHeader requestHeader) {
    final String authorization = requestHeader.get(RequestHeader.AUTHORIZATION);
    final ByteArrayOutputStream rendered = new ByteArrayOutputStream();
    // TODO: ip
    ServerSideIncludes.writeContent(path, rendered, authorization, "127.0.0.1", requestHeader);
    return rendered.toString();
}
/**
 * Adds a DOCTYPE declaration if the page does not start with one.
 * Leading whitespace (including blank lines) is skipped and the keyword is
 * compared case-insensitively, so pages starting with e.g.
 * "&lt;!doctype html&gt;" are left untouched instead of getting a second,
 * conflicting declaration.
 *
 * @param data the web page; may be null or empty
 * @return the unchanged page if it is null, empty or already starts with a
 *         DOCTYPE declaration, otherwise the page with a leading HTML 4.01
 *         Transitional DOCTYPE line
 */
private static String processAddDoctype(String data) {
    if (data == null || data.length() == 0) {
        return data;
    }

    // position of the first character that is not whitespace
    int start = 0;
    while (start < data.length() && Character.isWhitespace(data.charAt(start))) {
        start++;
    }

    final String keyword = "<!DOCTYPE";
    if (data.regionMatches(true, start, keyword, 0, keyword.length())) {
        return data;
    }
    return "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n"
            + data;
}
/**
 * Loads a snippet from a text file below
 * &lt;yacy.homedir&gt;/htroot/interaction/parts/.
 * The lines of the file are concatenated without their line terminators.
 * Reading is best effort: if the file cannot be opened or read, whatever
 * was read up to that point (possibly the empty string) is returned.
 *
 * @param part file name of the snippet inside the parts directory
 * @return the concatenated lines of the file, never null
 */
private static String loadPart(String part) {
    final String path = yacy.homedir + File.separatorChar + "htroot"
            + File.separatorChar + "interaction" + File.separatorChar
            + "parts" + File.separatorChar + part;

    final StringBuilder result = new StringBuilder();
    BufferedReader in = null;
    try {
        in = new BufferedReader(new FileReader(path));
        String str;
        while ((str = in.readLine()) != null) {
            result.append(str);
        }
    } catch (IOException ignored) {
        // deliberate best effort: a missing or unreadable snippet yields what was read so far
    } finally {
        // close in finally so the file handle is released even if reading fails
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing fails
            }
        }
    }
    return result.toString();
}
public static StringBuffer process (StringBuffer data, Charset charset, DigestURI url, RequestHeader requestHeader) {
globalrequestHeader = requestHeader;
Switchboard sb = Switchboard.getSwitchboard();
boolean augmented = false;
String Doc = data.toString();
// Send document to REFLECT (http://www.reflect.ws/REST_API.html)
if (sb.getConfigBool("augmentation.reflect", false) == true) {
try {
Doc = processExternal("http://reflect.ws/REST/GetHTML",
"document", Doc);
Log.logInfo("AUGMENTATION", "reflected " + url);
augmented = true;
} catch (Exception e) {
}
}
// Add DOCTYPE if not present.
// This is required for IE to render position:absolute correctly.
if (sb.getConfigBool("augmentation.addDoctype", true) == true) {
Doc = processAddDoctype(Doc);
augmented = true;
}
if (sb.getConfigBool("augmentation.reparse", true) == true) {
NodeList list = new NodeList();
// Fill NodeList with parsed Document
try {
org.htmlparser.Parser par = new org.htmlparser.Parser();
par.setInputHTML(Doc);
list = par.parse(null);
Log.logInfo ("AUGMENTATION", url.toString());
} catch (Exception e) {
}
// Add Unique ID to every node element which has no id yet.
// This allows consistent interaction between client (browser) and
// back-end (data store) by providing "position awareness" in the
// document.
if (sb.getConfigBool("augmentation.reparse.adduniqueid", true) == true) {
try {
NodeVisitor visitorAddUniqueID = new AugmentHtmlStream.VisitorAddUniqueID();
list.visitAllNodesWith(visitorAddUniqueID);
} catch (Exception e) {
}
}
// Inspect on text tags
try {
NodeVisitor visitorText = new AugmentHtmlStream.VisitorText();
list.visitAllNodesWith(visitorText);
} catch (Exception e) {
}
String SCI_GUID = "";
String SCI_GUID_DOI = "";
String SCI_GUID_PMID = "";
String SCI_TITLE = "";
String SCI_CREATOR = "";
String SCI_DESCRIPTION = "";
String SCI_IDENTIFIER = "";
String SCI_WHITELIST = "";
String SCI_URL = "";
String SCI_HASH = "";
SCI_URL = url.toString();
// System.out.println("Starting augmentation for " + url);
// System.out.println("Content: " + Doc);
if (!(list == null)) {
// DOCUMENT IS MANIPULABLE BY HTML REWRITER
// SO SEND IT TO YACY PARSER
Document document = null;
try {
final StringReader stringReader = new StringReader(Doc);
InputStream inputStream = new InputStream() {
@Override
public int read() throws IOException {
return stringReader.read();
}
};
document = Document.mergeDocuments(
url,
"text/html",
TextParser.parseSource(url, "text/html", null, data.length(), inputStream));
} catch (Exception e) {
}
if (document != null) {
if (document.dc_format() == "text/html") {
SCI_TITLE = document.dc_title();
SCI_CREATOR = document.dc_creator();
SCI_DESCRIPTION = document.dc_description();
SCI_IDENTIFIER = document.dc_identifier();
}
}
SCI_HASH = "" + url.hashCode();
// ADD AUGMENTED HEADER INFORMATION
NodeList header = list.extractAllNodesThatMatch(
new org.htmlparser.filters.NodeClassFilter(
org.htmlparser.tags.HeadTag.class), true);
org.htmlparser.util.SimpleNodeIterator iterHeader = header
.elements();
while (iterHeader.hasMoreNodes()) {
org.htmlparser.tags.HeadTag ht = ((org.htmlparser.tags.HeadTag) iterHeader
.nextNode());
NodeList headchildren = ht.getChildren();
headchildren.add(new org.htmlparser.nodes.TextNode(loadInternal("interactionparts/interaction.html", requestHeader)));
augmented = true;
ht.setChildren(headchildren);
}
// ADD AUGMENTED BODY INFORMATION
NodeList body = list.extractAllNodesThatMatch(
new org.htmlparser.filters.NodeClassFilter(
org.htmlparser.tags.BodyTag.class), true);
org.htmlparser.util.SimpleNodeIterator iterBody = body
.elements();
while (iterBody.hasMoreNodes()) {
org.htmlparser.tags.BodyTag bt = ((org.htmlparser.tags.BodyTag) iterBody
.nextNode());
NodeList bodychildren = bt.getChildren();
// ADD AUGMENTED INFO
org.htmlparser.tags.Div sci_aug = new org.htmlparser.tags.Div();
sci_aug.setTagName("div");
sci_aug.setAttribute("id", "sciety_augmented");
sci_aug.setAttribute("style",
"visibility: hidden; position: absolute; overflow: hidden;");
org.htmlparser.util.NodeList childr = new org.htmlparser.util.NodeList();
sci_aug.setChildren(childr);
org.htmlparser.tags.Div sci_aug_endtag = new org.htmlparser.tags.Div();
sci_aug_endtag.setTagName("/div");
sci_aug.setEndTag(sci_aug_endtag);
bodychildren.add(sci_aug);
bt.setChildren(bodychildren);
augmented = true;
}
Doc = list.toHtml(true);
augmented = true;
} // not list = null
} // reparse
if (augmented) {
return (new StringBuffer (Doc));
} else {
return (data);

@ -136,6 +136,7 @@ public final class yacy {
* {@link yacy#startup(String, long, long)} method.
*/
private static Switchboard sb = null;
public static String homedir;
/**
* Starts up the whole application. Sets up all datastructures and starts
@ -168,6 +169,8 @@ public final class yacy {
System.err.println("Error creating DATA-directory in " + dataHome.toString() + " . Please check your write-permission for this folder. YaCy will now terminate.");
System.exit(-1);
}
homedir = appHome.toString();
// setting up logging
f = new File(dataHome, "DATA/LOG/");

Loading…
Cancel
Save