- specified exceptions thrown by ResourceInfoFactory and plasmaHTCache.loadResourceInfo()

- caught possible NPE in CacheAdmin_p and added more error cases
- sped up deletion of entries in the local crawl queue by crawl profile (it has been noted often that this deletion is slow)
- added a bit of javadoc

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3868 6c8d7289-2bf4-0310-a012-ef5d649a1542
karlchenofhell 18 years ago
parent dfd5e823c3
commit 22ee85ca02

@@ -96,7 +96,12 @@
<span style="display: block;">#[line]#</span>#{/lines}#
</span>
::<span class="error">- This file is not cached -</span>
::<img src="CacheResource_p.html?path=#[src]#" alt="Cached image from #[src]#" />#(/type)#
::<img src="CacheResource_p.html?path=#[src]#" alt="Cached image from #[src]#" />
::<span class="error">- The protocol #[protoc]# is not supported by YaCy</span>
::<span class="error">
- IllegalAccessException - Security Manager is blocking dynamic class loading,
but it should not be active. Please report this incident!
</span>#(/type)#
<!-- TO-DO: CSS/XHTMLize end -->
</div>::
<div class="CacheAdminTree">

@@ -52,6 +52,7 @@
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.Map;
@@ -66,6 +67,7 @@ import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.UnsupportedProtocolException;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -80,6 +82,8 @@ public class CacheAdmin_p {
private static final int HtmlFile = 0;
private static final int NotCached = 1;
private static final int Image = 2;
private static final int ProtocolError = 3;
private static final int SecurityError = 4;
public static final class Filter implements FilenameFilter {
private static final String EXCLUDE_NAME = plasmaHTCache.DB_NAME;
@@ -105,7 +109,7 @@ public class CacheAdmin_p {
pathString = "/";
file = new File(switchboard.htCachePath, pathString);
}
} catch (Exception e) {
} catch (IOException e) {
pathString = "/";
file = new File(switchboard.htCachePath, pathString);
}
@@ -129,62 +133,69 @@
info.ensureCapacity(10000);
try {
final IResourceInfo resInfo = switchboard.cacheManager.loadResourceInfo(url);
formatHeader(prop, resInfo.getMap());
final String ff = file.toString();
final int dotpos = ff.lastIndexOf('.');
final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : "";
if (ext.equals("gif") || ext.equals("jpg") ||
ext.equals("png") || ext.equals("jpeg")) {
prop.put("info_type", Image);
prop.put("info_type_src", pathString);
if (resInfo == null) {
prop.put("info_type", NotCached);
} else {
prop.put("info_type", HtmlFile);
// fill the htmlFilterContentScraper object with the contents of the cached file
// to retrieve all needed information
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
//final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
Writer writer = new htmlFilterWriter(null,null,scraper,null,false);
String sourceCharset = resInfo.getCharacterEncoding();
if (sourceCharset == null) sourceCharset = "UTF-8";
String mimeType = resInfo.getMimeType();
serverFileUtils.copy(file, sourceCharset, writer);
writer.close();
final plasmaParserDocument document = switchboard.parser.transformScraper(url, mimeType, sourceCharset, scraper);
prop.put("info_type_title", scraper.getTitle());
int i;
String[] t = document.getSectionTitles();
prop.put("info_type_headlines", t.length);
for (i = 0; i < t.length; i++)
prop.put("info_type_headlines_" + i + "_headline",
t[i].replaceAll("\n", "").trim());
formatAnchor(prop, document.getHyperlinks(), "links");
formatImageAnchor(prop, document.getImages());
formatAnchor(prop, document.getAudiolinks(), "audio");
formatAnchor(prop, document.getVideolinks(), "video");
formatAnchor(prop, document.getApplinks(), "apps");
formatAnchor(prop, document.getEmaillinks(), "email");
formatHeader(prop, resInfo.getMap());
prop.put("info_type_text",
de.anomic.data.htmlTools.replaceXMLEntities(new String(scraper.getText())));
i = 0;
final Iterator sentences = document.getSentences(false);
if (sentences != null)
while (sentences.hasNext()) {
prop.put("info_type_lines_" + i + "_line",
new String((StringBuffer) sentences.next()).replaceAll("\n", "").trim());
i++;
}
prop.put("info_type_lines", i);
if (document != null) document.close();
final String ff = file.toString();
final int dotpos = ff.lastIndexOf('.');
final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : "";
if (ext.equals("gif") || ext.equals("jpg") ||
ext.equals("png") || ext.equals("jpeg")) {
prop.put("info_type", Image);
prop.put("info_type_src", pathString);
} else {
prop.put("info_type", HtmlFile);
// fill the htmlFilterContentScraper object with the contents of the cached file
// to retrieve all needed information
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
//final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
Writer writer = new htmlFilterWriter(null,null,scraper,null,false);
String sourceCharset = resInfo.getCharacterEncoding();
if (sourceCharset == null) sourceCharset = "UTF-8";
String mimeType = resInfo.getMimeType();
serverFileUtils.copy(file, sourceCharset, writer);
writer.close();
final plasmaParserDocument document = switchboard.parser.transformScraper(url, mimeType, sourceCharset, scraper);
prop.put("info_type_title", scraper.getTitle());
int i;
String[] t = document.getSectionTitles();
prop.put("info_type_headlines", t.length);
for (i = 0; i < t.length; i++)
prop.put("info_type_headlines_" + i + "_headline",
t[i].replaceAll("\n", "").trim());
formatAnchor(prop, document.getHyperlinks(), "links");
formatImageAnchor(prop, document.getImages());
formatAnchor(prop, document.getAudiolinks(), "audio");
formatAnchor(prop, document.getVideolinks(), "video");
formatAnchor(prop, document.getApplinks(), "apps");
formatAnchor(prop, document.getEmaillinks(), "email");
prop.put("info_type_text", new String(scraper.getText()));
i = 0;
final Iterator sentences = document.getSentences(false);
if (sentences != null)
while (sentences.hasNext()) {
prop.put("info_type_lines_" + i + "_line",
new String((StringBuffer) sentences.next()).replaceAll("\n", "").trim());
i++;
}
prop.put("info_type_lines", i);
if (document != null) document.close();
}
}
} catch (Exception e) {
} catch (IOException e) {
prop.put("info_type", NotCached);
} catch (UnsupportedProtocolException e) {
prop.put("info_type", ProtocolError);
} catch (IllegalAccessException e) {
prop.put("info_type", SecurityError);
}
} else {
prop.put("info", TypeDIR);
@@ -234,11 +245,12 @@ public class CacheAdmin_p {
}
}
}
prop.put("cachesize", Long.toString(switchboard.cacheManager.curCacheSize/1024));
prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024));
prop.put("path", path.toString());
prop.put("info_info", info.toString());
/* prop.put("info_tree", tree.toString()); */
// return rewrite properties
return prop;
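
Reading aid (not part of the commit): the five constants added above select among the alternatives of the #(type)# template switch shown in the first hunk (0 = HTML file, 1 = not cached, 2 = image, 3 = unsupported protocol, 4 = security error). Condensed, the new dispatch in CacheAdmin_p amounts to the following sketch; the image/HTML branching is simplified away:

try {
    final IResourceInfo resInfo = switchboard.cacheManager.loadResourceInfo(url);
    // resInfo == null means the resource is simply not cached (case 1)
    prop.put("info_type", (resInfo == null) ? NotCached : HtmlFile); // or Image (case 2) for image extensions
} catch (IOException e) {
    prop.put("info_type", NotCached);       // case 1
} catch (UnsupportedProtocolException e) {
    prop.put("info_type", ProtocolError);   // case 3
} catch (IllegalAccessException e) {
    prop.put("info_type", SecurityError);   // case 4
}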

@@ -21,12 +21,12 @@
Delete Entries:
<input type="text" name="pattern" value=".*" size="20" maxlength="200" />
<select name="option" size="1">
<option value="Initiator">Initiator</option>
<option value="Profile">Profile</option>
<option value="Depth">Depth</option>
<option value="ModifiedDate">Modified Date</option>
<option value="AnchorName">Anchor Name</option>
<option value="URL" selected="selected">URL</option>
<option value="5">Initiator</option>
<option value="3">Profile</option>
<option value="4">Depth</option>
<option value="6">Modified Date</option>
<option value="2">Anchor Name</option>
<option value="1" selected="selected">URL</option>
</select>
<input type="submit" name="deleteEntries" value="Delete" /><em>This may take a quite long time.</em>
</fieldset>
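
Note: the option names in the form were replaced by numeric codes that must stay in sync with the constants defined in IndexCreateWWWLocalQueue_p below (URL = 1, AnchorName = 2, Profile = 3, Depth = 4, Initiator = 5, ModifiedDate = 6). A condensed sketch of the receiving side, summarizing the servlet hunks that follow:

// The posted code is parsed once; INVALID (0) covers a missing or malformed value.
final int option = post.getInt("option", INVALID);
if (option == PROFILE) {
    // fast path: delete matching crawl profiles directly, independent of queue size
} else if (option > INVALID) {
    // iterate the queue and match the pattern against the field selected by 'option'
}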

@@ -70,6 +70,14 @@ public class IndexCreateWWWLocalQueue_p {
return dayFormatter.format(date);
}
private static final int INVALID = 0;
private static final int URL = 1;
private static final int ANCHOR = 2;
private static final int PROFILE = 3;
private static final int DEPTH = 4;
private static final int INITIATOR = 5;
private static final int MODIFIED = 6;
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
// return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
@@ -87,55 +95,59 @@ public class IndexCreateWWWLocalQueue_p {
int c = 0;
String pattern = post.get("pattern", ".*").trim();
String option = post.get("option", ".*").trim();
final int option = post.getInt("option", INVALID);
if (pattern.equals(".*")) {
c = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
switchboard.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
try { switchboard.cleanProfiles(); } catch (InterruptedException e) {/* ignore this */}
} else{
} else if (option > INVALID) {
Pattern compiledPattern = null;
try {
// compiling the regular expression
compiledPattern = Pattern.compile(pattern);
// iterating through the list of URLs
Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (iter.hasNext()) {
if ((entry = (plasmaCrawlEntry) iter.next()) == null) continue;
String value = null;
String nextHash = entry.urlhash();
if ((option.equals("URL")&&(entry.url() != null))) {
value = entry.url().toString();
} else if ((option.equals("AnchorName"))) {
value = entry.name();
} else if ((option.equals("Profile"))) {
String profileHandle = entry.profileHandle();
if (profileHandle == null) {
value = "unknown";
} else {
plasmaCrawlProfile.entry profile = switchboard.profiles.getEntry(profileHandle);
if (profile == null) {
value = "unknown";
} else {
value = profile.name();
}
}
} else if ((option.equals("Depth"))) {
value = Integer.toString(entry.depth());
} else if ((option.equals("Initiator"))) {
value = (entry.initiator()==null)?"proxy":htmlTools.replaceHTML(entry.initiator());
} else if ((option.equals("ModifiedDate"))) {
value = daydate(entry.loaddate());
if (option == PROFILE) {
// search and delete the crawl profile (_much_ faster, independent of queue size)
// XXX: what to do about the annoying LOST PROFILE messages in the log?
Iterator it = switchboard.profiles.profiles(true);
plasmaCrawlProfile.entry entry;
while (it.hasNext()) {
entry = (plasmaCrawlProfile.entry)it.next();
final String name = entry.name();
if (name.equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) ||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) ||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA))
continue;
if (compiledPattern.matcher(name).find())
switchboard.profiles.removeEntry(entry.handle());
}
if (value != null) {
Matcher matcher = compiledPattern.matcher(value);
if (matcher.find()) {
switchboard.noticeURL.remove(nextHash);
}
} else {
// iterating through the list of URLs
Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (iter.hasNext()) {
if ((entry = (plasmaCrawlEntry) iter.next()) == null) continue;
String value = null;
switch (option) {
case URL: value = (entry.url() == null) ? null : entry.url().toString(); break;
case ANCHOR: value = entry.name(); break;
case DEPTH: value = Integer.toString(entry.depth()); break;
case INITIATOR:
value = (entry.initiator() == null) ? "proxy" : htmlTools.replaceHTML(entry.initiator());
break;
case MODIFIED: value = daydate(entry.loaddate()); break;
default: value = null;
}
if (value != null) {
Matcher matcher = compiledPattern.matcher(value);
if (matcher.find()) {
switchboard.noticeURL.remove(entry.urlhash());
}
}
}
}
} catch (PatternSyntaxException e) {
e.printStackTrace();

@@ -57,7 +57,7 @@ public class ResourceInfoFactory {
public IResourceInfo buildResourceInfoObj(
URL resourceURL,
Map resourceMetadata
) throws Exception {
) throws UnsupportedProtocolException, IllegalAccessException {
String protocString = resourceURL.getProtocol();
@@ -65,25 +65,37 @@
if (protocString.equals("https")) protocString = "http";
// the full qualified class name
String className = this.getClass().getPackage().getName() + "." + protocString + ".ResourceInfo";
// loading class by name
Class moduleClass = Class.forName(className);
// getting the constructor
Constructor classConstructor = moduleClass.getConstructor( new Class[] {
URL.class,
Map.class
} );
// instantiating class
IResourceInfo infoObject = (IResourceInfo) classConstructor.newInstance(new Object[] {
resourceURL,
resourceMetadata
});
// return the newly created object
return infoObject;
final String className = this.getClass().getPackage().getName() + "." + protocString + ".ResourceInfo";
try {
// loading class by name
final Class moduleClass = Class.forName(className);
// getting the constructor
final Constructor classConstructor = moduleClass.getConstructor( new Class[] {
URL.class,
Map.class
} );
// instantiating class
final IResourceInfo infoObject = (IResourceInfo) classConstructor.newInstance(new Object[] {
resourceURL,
resourceMetadata
});
// return the newly created object
return infoObject;
} catch (Exception e) {
if (e instanceof RuntimeException) {
throw (RuntimeException)e;
} else if (e instanceof ClassNotFoundException) {
throw new UnsupportedProtocolException(protocString, e);
} else if (e instanceof IllegalAccessException) {
throw (IllegalAccessException)e;
} else {
e.printStackTrace();
return null;
}
}
}
}

@@ -0,0 +1,21 @@
package de.anomic.plasma.cache;
/**
* This exception is thrown when a protocol (or a derivative using this protocol) is not
* supported, as is the case in the {@link ResourceInfoFactory}.
* @see de.anomic.plasma.cache for all available {@link IResourceInfo} implementations
*/
public class UnsupportedProtocolException extends Exception {
private static final long serialVersionUID = 1L;
public static final String MESSAGE = "Unsupported protocol error: ";
public UnsupportedProtocolException(String protocol) {
super(MESSAGE + protocol);
}
public UnsupportedProtocolException(String protocol, Throwable cause) {
super(MESSAGE + protocol, cause);
}
}

@@ -82,6 +82,7 @@ import de.anomic.kelondro.kelondroMapObjects;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.ResourceInfoFactory;
import de.anomic.plasma.cache.UnsupportedProtocolException;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverInstantThread;
@@ -496,11 +497,14 @@ public final class plasmaHTCache {
/**
* Returns an object containing metadata about a cached resource
* @param url the url of the resource
* @return an {@link IResourceInfo info object}
* @throws Exception if the info object could not be created, e.g. if the protocol is not supported
* @param url the {@link URL} of the resource
* @return an {@link IResourceInfo info object}
* @throws <b>IllegalAccessException</b> if the {@link SecurityManager} doesn't allow instantiation
* of the info object with the given protocol
* @throws <b>UnsupportedProtocolException</b> if the protocol is not supported and therefore the
* info object couldn't be created
*/
public IResourceInfo loadResourceInfo(URL url) throws Exception {
public IResourceInfo loadResourceInfo(URL url) throws UnsupportedProtocolException, IllegalAccessException {
// getting the URL hash
String urlHash = plasmaURL.urlHash(url.toNormalform());
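
With the narrowed signature, callers of loadResourceInfo() can now distinguish the failure modes instead of catching a generic Exception; CacheAdmin_p above does exactly this. A minimal caller-side sketch (assuming url is a de.anomic.net.URL already in scope):

try {
    final IResourceInfo resInfo = switchboard.cacheManager.loadResourceInfo(url);
    if (resInfo == null) {
        // nothing is cached for this URL - not an error condition
    }
} catch (UnsupportedProtocolException e) {
    // no ResourceInfo implementation exists for the URL's protocol
} catch (IllegalAccessException e) {
    // a SecurityManager blocked the reflective instantiation
}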

@@ -1510,6 +1510,28 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
initProfiles();
}
/**
* {@link plasmaCrawlProfile Crawl Profiles} are saved independently from the queues themselves
* and therefore have to be cleaned up from time to time. This method performs the clean-up
* if, and only if, the {@link plasmaSwitchboardQueue switchboard},
* {@link plasmaCrawlLoader loader} and {@link plasmaCrawlNURL local crawl} queues are all empty.
* <p>
* Then it iterates through all existing {@link plasmaCrawlProfile crawl profiles} and removes
* all profiles which are not hardcoded.
* </p>
* <p>
* <i>If this method encounters DB failures, the profile DB will be reset and</i>
* <code>true</code><i> will be returned</i>
* </p>
* @see #CRAWL_PROFILE_PROXY hardcoded
* @see #CRAWL_PROFILE_REMOTE hardcoded
* @see #CRAWL_PROFILE_SNIPPET_TEXT hardcoded
* @see #CRAWL_PROFILE_SNIPPET_MEDIA hardcoded
* @return whether this method has done something or not (it may do nothing, e.g. because the
* queues are still filled or there are no profiles left to clean up)
* @throws <b>InterruptedException</b> if the current thread has been interrupted, e.g. by the
* shutdown procedure
*/
public boolean cleanProfiles() throws InterruptedException {
if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (noticeURL.stackSize() > 0)) return false;
final Iterator iter = profiles.profiles(true);
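
The method body is truncated in this hunk. Judging from the javadoc above and the analogous profile loop in IndexCreateWWWLocalQueue_p, the remainder plausibly iterates over all profiles and removes every entry that is not hardcoded - a hedged reconstruction, not the verbatim code:

plasmaCrawlProfile.entry entry;
boolean hasDoneSomething = false;
while (iter.hasNext()) {
    // cooperate with the shutdown procedure (see @throws above)
    if (Thread.currentThread().isInterrupted())
        throw new InterruptedException("cleanProfiles interrupted");
    entry = (plasmaCrawlProfile.entry) iter.next();
    final String name = entry.name();
    if (name.equals(CRAWL_PROFILE_PROXY) || name.equals(CRAWL_PROFILE_REMOTE) ||
        name.equals(CRAWL_PROFILE_SNIPPET_TEXT) || name.equals(CRAWL_PROFILE_SNIPPET_MEDIA))
        continue;
    profiles.removeEntry(entry.handle());
    hasDoneSomething = true;
}
return hasDoneSomething;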
