- adding automatic refresh - accepts new parameter nameLookup which can be used to deactivate yacy-peer name lookup (because we have problems with this on large seed-dbs) *) ViewFile New page that can be used to view - original content - plain text content - parsed content - parsed sentences of a webpage specified by its URL hash. Mainly for debugging purposes at the moment *) Robots.txt Bugfix for if-modified-since usage TODO: synchronization of downloads to avoid loading the same robots-file multiple times in parallel by different threads *) Shutdown Better aborting of transferRWI and transferURL sessions on server shutdown *) Status Page Adding icon to start/stop crawling via status page git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@950 6c8d7289-2bf4-0310-a012-ef5d649a1542 pull/1/head
parent
bcb0d6d5ff
commit
40777556c5
@ -0,0 +1,85 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<title>YaCy '#[clientname]#': View URL Content</title>
#[metas]#
</head>
<body marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
#[header]#
<br><br>
<h2>View URL Content</h2>

<!--
  Template for the ViewFile servlet.
  #(error)# selects one alternative by index: 0 = metadata table of the URL entry,
  1..5 = error messages. #(viewMode)# selects how the resource content is shown:
  0 = nothing, 1 = plain text, 2 = parsed text, 3 = parsed sentences, 4 = iframe.
-->
<p><font color="red">
#(error)#
<table border="0" cellpadding="2" cellspacing="1">
  <tr class="TableHeader">
    <td>URL</td>
    <td><a href="#[url]#">#[url]#</a></td>
  </tr>
  <tr class="TableCellDark">
    <td>Hash</td>
    <td><tt>#[hash]#</tt></td>
  </tr>
  <tr class="TableCellLight">
    <td>Word Count</td>
    <td><tt>#[wordCount]#</tt></td>
  </tr>
  <tr class="TableCellDark">
    <td>Description</td>
    <td><tt>#[desc]#</tt></td>
  </tr>
  <tr class="TableCellLight">
    <td>Size</td>
    <td><tt>#[size]#</tt></td>
  </tr>
  <tr class="TableCellDark">
    <td>View as:</td>
    <td>
      <!-- "&" inside an attribute value has to be written as "&amp;" in HTML 4 -->
      <a href="?urlHash=#[hash]#&amp;viewMode=iframe">Original</a> |
      <a href="?urlHash=#[hash]#&amp;viewMode=plain">Plain Text</a> |
      <a href="?urlHash=#[hash]#&amp;viewMode=parsed">Parsed Text</a> |
      <a href="?urlHash=#[hash]#&amp;viewMode=sentences">Parsed Sentences</a>
    </td>
  </tr>
</table>
:: <!-- 1 -->
No URL hash submitted.
:: <!-- 2 -->
Unable to find URL Entry in DB
:: <!-- 3 -->
Invalid URL
:: <!-- 4 -->
Unable to download resource content.
:: <!-- 5 -->
Unable to parse resource content.
#(/error)#
</font>
</p>
<p>
#(viewMode)#
:: <!-- 1 -->
<h3>Plain Resource Content</h3><br>
<tt>#[plainText]#</tt>
:: <!-- 2 -->
<h3>Parsed Resource Content</h3><br>
<tt>#[parsedText]#</tt>
:: <!-- 3 -->
<h3>Parsed Resource Sentences</h3><br>
<table border="0" cellpadding="2" cellspacing="1">
  #{sentences}#
  <tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
    <td>#[nr]#</td>
    <td><tt>#[text]#</tt></td>
  </tr>
  #{/sentences}#
</table>
:: <!-- 4 -->
<h3>Original Resource Content</h3><br>
<iframe src="#[url]#" width="800" height="400">
</iframe>
#(/viewMode)#
</p>

#[footer]#
</body>
</html>
|
@ -0,0 +1,173 @@
|
||||
//ViewFile.java
|
||||
//-----------------------
|
||||
//part of YaCy
|
||||
//(C) by Michael Peter Christen; mc@anomic.de
|
||||
//first published on http://www.anomic.de
|
||||
//Frankfurt, Germany, 2004
|
||||
//
|
||||
//last major change: 12.07.2004
|
||||
//
|
||||
//This program is free software; you can redistribute it and/or modify
|
||||
//it under the terms of the GNU General Public License as published by
|
||||
//the Free Software Foundation; either version 2 of the License, or
|
||||
//(at your option) any later version.
|
||||
//
|
||||
//This program is distributed in the hope that it will be useful,
|
||||
//but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
//GNU General Public License for more details.
|
||||
//
|
||||
//You should have received a copy of the GNU General Public License
|
||||
//along with this program; if not, write to the Free Software
|
||||
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
//Using this software in any meaning (reading, learning, copying, compiling,
|
||||
//running) means that you agree that the Author(s) is (are) not responsible
|
||||
//for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
//by usage of this softare or this documentation. The usage of this software
|
||||
//is on your own risk. The installation and usage (starting/running) of this
|
||||
//software may allow other people or application to access your computer and
|
||||
//any attached devices and is highly dependent on the configuration of the
|
||||
//software which must be done by the user of the software; the author(s) is
|
||||
//(are) also not responsible for proper configuration and usage of the
|
||||
//software, even if provoked by documentation provided together with
|
||||
//the software.
|
||||
//
|
||||
//Any changes to this file according to the GPL as documented in the file
|
||||
//gpl.txt aside this file in the shipment you received can be done to the
|
||||
//lines that follows this copyright notice here, but changes must not be
|
||||
//done inside the copyright notive above. A re-distribution must contain
|
||||
//the intact and unchanged copyright notice.
|
||||
//Contributions and changes to the program code must be marked as such.
|
||||
|
||||
//you must compile this file with
|
||||
//javac -classpath .:../Classes ViewFile.java
|
||||
//if the shell's current path is HTROOT
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.plasmaCrawlLURL.Entry;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
||||
public class ViewFile {
|
||||
|
||||
public static final int VIEW_MODE_NO_TEXT = 0;
|
||||
public static final int VIEW_MODE_AS_PLAIN_TEXT = 1;
|
||||
public static final int VIEW_MODE_AS_PARSED_TEXT = 2;
|
||||
public static final int VIEW_MODE_AS_PARSED_SENTENCES = 3;
|
||||
public static final int VIEW_MODE_AS_IFRAME = 4;
|
||||
|
||||
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
|
||||
|
||||
serverObjects prop = new serverObjects();
|
||||
plasmaSwitchboard sb = (plasmaSwitchboard)env;
|
||||
|
||||
if (post != null) {
|
||||
// getting the url hash from which the content should be loaded
|
||||
String urlHash = post.get("urlHash","");
|
||||
if (urlHash.equals("")) {
|
||||
prop.put("error",1);
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
}
|
||||
|
||||
String viewMode = post.get("viewMode","plain");
|
||||
|
||||
// getting the urlEntry that belongs to the url hash
|
||||
Entry urlEntry = sb.urlPool.loadedURL.getEntry(urlHash);
|
||||
if (urlEntry == null) {
|
||||
prop.put("error",2);
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
}
|
||||
|
||||
// gettin the url that belongs to the entry
|
||||
URL url = urlEntry.url();
|
||||
if (url == null) {
|
||||
prop.put("error",3);
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
}
|
||||
|
||||
// loading the resource content as byte array
|
||||
byte[] resource = null;
|
||||
try {
|
||||
resource = sb.cacheManager.loadResource(url);
|
||||
if (resource == null) {
|
||||
sb.snippetCache.loadResourceFromWeb(url, 5000);
|
||||
|
||||
resource = sb.cacheManager.loadResource(url);
|
||||
if (resource == null) {
|
||||
prop.put("error",4);
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
if (url == null) {
|
||||
prop.put("error",4);
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
}
|
||||
}
|
||||
if (viewMode.equals("plain")) {
|
||||
String content = new String(resource);
|
||||
content = content.replaceAll("<","<")
|
||||
.replaceAll(">",">")
|
||||
.replaceAll("\"",""")
|
||||
.replaceAll("\n","<br>")
|
||||
.replaceAll("\t"," ");
|
||||
|
||||
prop.put("error",0);
|
||||
prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
|
||||
prop.put("viewMode_plainText",content);
|
||||
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
|
||||
// parsing the resource content
|
||||
plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource);
|
||||
if (document == null) {
|
||||
prop.put("error",5);
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
}
|
||||
|
||||
if (viewMode.equals("parsed")) {
|
||||
String content = new String(document.getText());
|
||||
content = content.replaceAll("\n","<br>")
|
||||
.replaceAll("\t"," ");
|
||||
|
||||
prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
|
||||
prop.put("viewMode_parsedText",content);
|
||||
} else if (viewMode.equals("iframe")) {
|
||||
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
|
||||
prop.put("viewMode_url",url.toString());
|
||||
} else {
|
||||
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
|
||||
String[] sentences = document.getSentences();
|
||||
|
||||
boolean dark = true;
|
||||
for (int i=0; i < sentences.length; i++) {
|
||||
prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
|
||||
prop.put("viewMode_sentences_" + i + "_text",sentences[i]);
|
||||
prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
|
||||
}
|
||||
prop.put("viewMode_sentences",sentences.length);
|
||||
|
||||
}
|
||||
}
|
||||
prop.put("error",0);
|
||||
prop.put("error_url",url.toString());
|
||||
prop.put("error_hash",urlHash);
|
||||
prop.put("error_wordCount",Integer.toString(urlEntry.wordCount()));
|
||||
prop.put("error_desc",urlEntry.descr());
|
||||
prop.put("error_size",urlEntry.size());
|
||||
}
|
||||
|
||||
return prop;
|
||||
}
|
||||
|
||||
}
|
After Width: | Height: | Size: 88 B |
After Width: | Height: | Size: 90 B |
Loading…
Reference in new issue