@@ -46,19 +46,21 @@
 // if the shell's current path is HTROOT
 package xml.util;
 import java.io.IOException;
+import java.io.Writer;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
-import java.util.Iterator;
 
-import de.anomic.data.htmlTools;
 import de.anomic.data.robotsParser;
+import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.htmlFilter.htmlFilterWriter;
 import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
 import de.anomic.net.URL;
 import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.server.serverFileUtils;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
-import de.anomic.tools.nxTools;
 
 public class getpageinfo_p {
     public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
@@ -81,22 +83,20 @@ public class getpageinfo_p {
         if (actions.indexOf("title")>=0) {
             try {
                 URL u = new URL(url);
-                content = nxTools.strings(httpc.wget(u, u.getHost(), 6000, null, null, ((plasmaSwitchboard) env).remoteProxyConfig, null));
-                Iterator it = content.iterator();
-                String line;
-                String title;
-                while (it.hasNext()) {
-                    line = (String) it.next();
-                    try {
-                        title = line.substring(line.toLowerCase().indexOf(
-                                "<title>") + 7, line.toLowerCase().indexOf(
-                                "</title>"));
-                        // de-replace html entities
-                        title = htmlTools.deReplaceHTML(title);
-                        prop.put("title", title);
-                    } catch (IndexOutOfBoundsException e) {
-                    }
+                String contentString=new String(httpc.wget(u, u.getHost(), 6000, null, null, ((plasmaSwitchboard) env).remoteProxyConfig, null)) ;
+
+                htmlFilterContentScraper scraper = new htmlFilterContentScraper(u);
+                //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
+                Writer writer = new htmlFilterWriter(null,null,scraper,null,false);
+                serverFileUtils.write(contentString,writer);
+                writer.close();
+
+                prop.put("title", scraper.getTitle());
+                String list[]=scraper.getKeywords();
+                for(int i=0;i<list.length;i++){
+                    prop.putSafeXML("tags_"+i+"_tag", list[i]);
                 }
+                prop.put("tags", list.length);
 
             } catch (MalformedURLException e) {
             } catch (IOException e) {
@@ -116,7 +116,9 @@ public class getpageinfo_p {
                 // get the sitemap URL of the domain
                 URL sitemapURL = robotsParser.getSitemapURL(theURL);
                 prop.put("sitemap", (sitemapURL==null)?"":sitemapURL.toString());
-            } catch (MalformedURLException e) {}
+            } catch (MalformedURLException e) {
+                prop.put("sitemap", "");
+            }
         }
 
     }
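
Note: the core of this patch is the switch from scanning individual response lines for "<title>" to parsing the whole fetched document with htmlFilterContentScraper, which also makes the keyword meta tags available for the new tags_N_tag output. The sketch below is not part of the patch; the class name PageInfoSketch and the method scrapeTitle are made up for illustration, and the YaCy-internal signatures (httpc.wget returning the raw page bytes, serverFileUtils.write(String, Writer), htmlFilterWriter, htmlFilterContentScraper) are assumed to be exactly as they appear in the hunks above.

import java.io.IOException;
import java.io.Writer;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.net.URL;
import de.anomic.server.serverFileUtils;

// Illustrative sketch only; all signatures are taken from the patch above.
public class PageInfoSketch {

    // Feed the fetched HTML through an htmlFilterWriter so the scraper collects
    // the title (and keywords) while parsing, instead of substring-matching
    // "<title>" on single lines as the removed code did.
    public static String scrapeTitle(URL u, String pageContent) throws IOException {
        htmlFilterContentScraper scraper = new htmlFilterContentScraper(u);
        Writer writer = new htmlFilterWriter(null, null, scraper, null, false);
        serverFileUtils.write(pageContent, writer); // parses the whole document
        writer.close();
        return scraper.getTitle();
    }
}

Compared to the removed loop, this no longer requires the title to start and end on the same line, and the explicit htmlTools.deReplaceHTML() step disappears because the scraper result is used directly.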