You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
yacy_search_server/htroot/CrawlResults.java

266 lines
13 KiB

// CrawlResults.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 09.03.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
/**
 * Builds the template replacement properties for the crawl results table.
 * The generated rows reference the feedback page "CrawlResults.html".
 *
 * The request parameter "process" selects the URL stack ("table type") to show.
 * Table types 1 to 5, as well as the "clearlist" and "deleteentry" commands,
 * are only served after a successful authentication check.
 */
public class CrawlResults {

    /** Number of table rows shown when the request carries no usable row count. */
    private static final int DEFAULT_LINES = 500;

    /**
     * Day-resolution formatter for the "modified" column.
     * SimpleDateFormat is not thread-safe, so every use must hold this object's
     * monitor (see {@link #daydate(Date)}).
     */
    private static final SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);

    /**
     * Evaluates the request and returns the properties that fill the page template.
     *
     * @param header request header; its "Authorization" entry is read for protected requests
     * @param post   request parameters; may be null, which is treated as "process=0"
     * @param env    the server environment; must be a plasmaSwitchboard (it is cast to one)
     * @return the replacement properties; contains only "AUTHENTICATE" when a log-in is required
     */
    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
        // return variable that accumulates replacements
        final plasmaSwitchboard sb = (plasmaSwitchboard) env;
        final serverObjects prop = new serverObjects();

        int lines = DEFAULT_LINES;
        boolean showControl = env.getConfigBool("IndexMonitorControl", true);
        boolean showInit = env.getConfigBool("IndexMonitorInit", false);
        boolean showExec = env.getConfigBool("IndexMonitorExec", false);
        boolean showDate = env.getConfigBool("IndexMonitorDate", true);
        boolean showWords = env.getConfigBool("IndexMonitorWords", true);
        boolean showTitle = env.getConfigBool("IndexMonitorTitle", true);
        boolean showURL = env.getConfigBool("IndexMonitorURL", true);

        if (post == null) {
            post = new serverObjects();
            post.put("process", "0");
        }

        // find process number; anything unparsable selects the overview (0)
        int tabletype;
        try {
            tabletype = Integer.parseInt(post.get("process", "0"));
        } catch (NumberFormatException e) {
            tabletype = 0;
        }

        // check if authorization is needed and/or given
        final boolean protectedRequest = ((tabletype > 0) && (tabletype < 6))
                || post.containsKey("clearlist")
                || post.containsKey("deleteentry");
        if (protectedRequest) {
            final String authorization = ((String) header.get("Authorization", "xxxxxx"));
            // an empty header forces a log-in without consulting the switchboard;
            // otherwise the switchboard decides whether the credentials are acceptable
            if (authorization.length() == 0 || !sb.verifyAuthentication(header, true)) {
                prop.put("AUTHENTICATE", "admin log-in");
                return prop;
            }
        }

        // custom number of lines
        if (post.containsKey("count")) {
            lines = parseLineCount(post.get("count", "500"), DEFAULT_LINES);
        }

        // do the commands
        if (post.containsKey("clearlist")) {
            sb.wordIndex.loadedURL.clearStack(tabletype);
        }
        if (post.containsKey("deleteentry")) {
            final String hash = post.get("hash", null);
            if (hash != null) {
                // delete from database
                sb.wordIndex.loadedURL.remove(hash);
            }
        }
        if (post.containsKey("moreIndexed")) {
            lines = parseLineCount(post.get("showIndexed", "500"), DEFAULT_LINES);
        }

        // per-request column switches: "0" hides a column, any other value shows it
        showControl = toggle(post, "sc", showControl);
        showInit = toggle(post, "si", showInit);
        showExec = toggle(post, "se", showExec);
        showDate = toggle(post, "sd", showDate);
        showWords = toggle(post, "sw", showWords);
        showTitle = toggle(post, "st", showTitle);
        showURL = toggle(post, "su", showURL);

        // create table
        if (tabletype == 0) {
            prop.put("table", 2);
        } else {
            // take one snapshot of the stack size so that the header counters and
            // the row window below are computed from the same value
            final int stackSize = sb.wordIndex.loadedURL.getStackSize(tabletype);
            if (stackSize == 0) {
                prop.put("table", 0);
            } else {
                prop.put("table", 1);
                if (lines > stackSize) {
                    lines = stackSize;
                }
                if (lines == stackSize) {
                    prop.put("table_size", 0);
                } else {
                    prop.put("table_size", 1);
                    prop.put("table_size_count", lines);
                }
                prop.put("table_size_all", stackSize);

                if (showControl) {
                    prop.put("table_showControl", 1);
                    prop.put("table_showControl_feedbackpage", "CrawlResults.html");
                    prop.put("table_showControl_tabletype", tabletype);
                } else {
                    prop.put("table_showControl", 0);
                }
                prop.put("table_showInit", (showInit) ? 1 : 0);
                prop.put("table_showExec", (showExec) ? 1 : 0);
                prop.put("table_showDate", (showDate) ? 1 : 0);
                prop.put("table_showWords", (showWords) ? 1 : 0);
                prop.put("table_showTitle", (showTitle) ? 1 : 0);
                prop.put("table_showURL", (showURL) ? 1 : 0);

                boolean dark = true;
                String urlHash, initiatorHash, executorHash;
                String cachepath, urlstr, urltxt;
                yacySeed initiatorSeed, executorSeed;
                indexURLEntry urle;

                // needed for getCachePath(url)
                final plasmaHTCache cacheManager = sb.getCacheManager();

                // walk the stack from its highest index downwards, emitting at most 'lines' rows
                int cnt = 0;
                final int lowestIndex = stackSize - lines;
                for (int i = stackSize - 1; i >= lowestIndex; i--) {
                    initiatorHash = sb.wordIndex.loadedURL.getInitiatorHash(tabletype, i);
                    executorHash = sb.wordIndex.loadedURL.getExecutorHash(tabletype, i);
                    urlHash = sb.wordIndex.loadedURL.getUrlHash(tabletype, i);
                    try {
                        urle = sb.wordIndex.loadedURL.load(urlHash, null);
                        indexURLEntry.Components comp = urle.comp();
                        initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
                        executorSeed = yacyCore.seedDB.getConnected(executorHash);

                        urlstr = comp.url().toNormalform(false, true);
                        urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
                        // cache file path relative to the cache root, with forward slashes
                        cachepath = cacheManager.getCachePath(new URL(urlstr)).toString().replace('\\', '/').substring(cacheManager.cachePath.toString().length() + 1);

                        final String row = "table_indexed_" + cnt;
                        prop.put(row + "_dark", (dark) ? 1 : 0);

                        if (showControl) {
                            prop.put(row + "_showControl", 1);
                            prop.put(row + "_showControl_feedbackpage", "CrawlResults.html");
                            prop.put(row + "_showControl_tabletype", tabletype);
                            prop.put(row + "_showControl_urlhash", urlHash);
                        } else {
                            prop.put(row + "_showControl", 0);
                        }

                        if (showInit) {
                            prop.put(row + "_showInit", 1);
                            prop.put(row + "_showInit_initiatorSeed", (initiatorSeed == null) ? "unknown" : initiatorSeed.getName());
                        } else {
                            prop.put(row + "_showInit", 0);
                        }

                        if (showExec) {
                            prop.put(row + "_showExec", 1);
                            prop.put(row + "_showExec_executorSeed", (executorSeed == null) ? "unknown" : executorSeed.getName());
                        } else {
                            prop.put(row + "_showExec", 0);
                        }

                        if (showDate) {
                            prop.put(row + "_showDate", 1);
                            prop.put(row + "_showDate_modified", daydate(urle.moddate()));
                        } else {
                            prop.put(row + "_showDate", 0);
                        }

                        if (showWords) {
                            prop.put(row + "_showWords", 1);
                            prop.put(row + "_showWords_count", urle.wordCount());
                        } else {
                            prop.put(row + "_showWords", 0);
                        }

                        if (showTitle) {
                            prop.put(row + "_showTitle", 1);
                            // NOTE(review): cachepath as computed above is never null, so the
                            // "not available" branch looks unreachable; kept so the template
                            // switch stays wired - confirm before removing
                            if (cachepath == null) {
                                prop.put(row + "_showTitle_available", 0);
                            } else {
                                prop.put(row + "_showTitle_available", 1);
                                if (comp.title() == null || comp.title().trim().length() == 0) {
                                    prop.put(row + "_showTitle_available_nodescr", 0);
                                } else {
                                    prop.put(row + "_showTitle_available_nodescr", 1);
                                }
                                // the description is stored regardless of the nodescr switch
                                prop.put(row + "_showTitle_available_nodescr_urldescr", comp.title());
                                prop.put(row + "_showTitle_available_cachepath", cachepath);
                                prop.put(row + "_showTitle_available_urltitle", urlstr);
                            }
                        } else {
                            prop.put(row + "_showTitle", 0);
                        }

                        if (showURL) {
                            prop.put(row + "_showURL", 1);
                            if (cachepath == null) {
                                prop.put(row + "_showURL_available", 0);
                            } else {
                                prop.put(row + "_showURL_available", 1);
                                prop.put(row + "_showURL_available_cachepath", cachepath);
                                prop.put(row + "_showURL_available_urltitle", urlstr);
                                prop.put(row + "_showURL_available_url", urltxt);
                            }
                        } else {
                            prop.put(row + "_showURL", 0);
                        }

                        // only rows that were built completely count and flip the shading
                        dark = !dark;
                        cnt++;
                    } catch (Exception e) {
                        // a broken entry must not take down the whole page: log it and skip the row
                        serverLog.logSevere("PLASMA", "genTableProps", e);
                    }
                }
                prop.put("table_indexed", cnt);
            }
        }
        prop.put("process", tabletype);

        // return rewrite properties
        return prop;
    }

    /**
     * Parses a requested row count.
     *
     * @param value    the raw request value
     * @param fallback the count to use when the value is not a non-negative integer
     * @return the parsed count, or fallback for malformed or negative input
     */
    private static int parseLineCount(String value, int fallback) {
        try {
            final int parsed = Integer.parseInt(value);
            return (parsed < 0) ? fallback : parsed;
        } catch (NumberFormatException e) {
            // user-supplied garbage must not abort the request
            return fallback;
        }
    }

    /**
     * Reads an optional column switch from the request.
     *
     * @param post    the request parameters
     * @param key     the parameter name of the switch
     * @param current the value to keep when the parameter is absent
     * @return current if the parameter is absent, false if it equals "0", true otherwise
     */
    private static boolean toggle(serverObjects post, String key, boolean current) {
        final Object value = post.get(key);
        if (value == null) {
            return current;
        }
        return !value.equals("0");
    }

    /**
     * Formats a date as "yyyy/MM/dd".
     *
     * @param date the date to format; may be null
     * @return the formatted day, or an empty string for a null date
     */
    private static String daydate(Date date) {
        if (date == null) {
            return "";
        }
        // the formatter is shared by all callers and keeps internal state while formatting
        synchronized (dayFormatter) {
            return dayFormatter.format(date);
        }
    }
}