diff --git a/htroot/CacheAdmin_p.html b/htroot/CacheAdmin_p.html index 21e77006c..eafa7464d 100644 --- a/htroot/CacheAdmin_p.html +++ b/htroot/CacheAdmin_p.html @@ -96,7 +96,12 @@ #[line]##{/lines}# ::- This file is not cached - - ::Cached image from #[src]##(/type)# + ::Cached image from #[src]# + ::- The protocol #[protoc]# is not supported by YaCy + :: + - IllegalAccessException -Security Manager is blocking dynamic class loading + but should not be active. Please report this incident! + #(/type)# ::
diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index da1b691ac..8f0494c2b 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -52,6 +52,7 @@ import java.io.File; import java.io.FilenameFilter; +import java.io.IOException; import java.io.Writer; import java.util.Iterator; import java.util.Map; @@ -66,6 +67,7 @@ import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.cache.IResourceInfo; +import de.anomic.plasma.cache.UnsupportedProtocolException; import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -80,6 +82,8 @@ public class CacheAdmin_p { private static final int HtmlFile = 0; private static final int NotCached = 1; private static final int Image = 2; + private static final int ProtocolError = 3; + private static final int SecurityError = 4; public static final class Filter implements FilenameFilter { private static final String EXCLUDE_NAME = plasmaHTCache.DB_NAME; @@ -105,7 +109,7 @@ public class CacheAdmin_p { pathString = "/"; file = new File(switchboard.htCachePath, pathString); } - } catch (Exception e) { + } catch (IOException e) { pathString = "/"; file = new File(switchboard.htCachePath, pathString); } @@ -129,62 +133,69 @@ public class CacheAdmin_p { info.ensureCapacity(10000); try { final IResourceInfo resInfo = switchboard.cacheManager.loadResourceInfo(url); - formatHeader(prop, resInfo.getMap()); - - final String ff = file.toString(); - final int dotpos = ff.lastIndexOf('.'); - final String ext = (dotpos >= 0) ? 
ff.substring(dotpos + 1).toLowerCase() : ""; - if (ext.equals("gif") || ext.equals("jpg") || - ext.equals("png") || ext.equals("jpeg")) { - prop.put("info_type", Image); - prop.put("info_type_src", pathString); + if (resInfo == null) { + prop.put("info_type", NotCached); } else { - prop.put("info_type", HtmlFile); - // fill the htmlFilerContentScraper object with the contents of the cached file - // to retrieve all needed information - final htmlFilterContentScraper scraper = new htmlFilterContentScraper(url); - //final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); - Writer writer = new htmlFilterWriter(null,null,scraper,null,false); - String sourceCharset = resInfo.getCharacterEncoding(); - if (sourceCharset == null) sourceCharset = "UTF-8"; - String mimeType = resInfo.getMimeType(); - serverFileUtils.copy(file, sourceCharset, writer); - writer.close(); - - final plasmaParserDocument document = switchboard.parser.transformScraper(url, mimeType, sourceCharset, scraper); - - prop.put("info_type_title", scraper.getTitle()); - - int i; - String[] t = document.getSectionTitles(); - prop.put("info_type_headlines", t.length); - for (i = 0; i < t.length; i++) - prop.put("info_type_headlines_" + i + "_headline", - t[i].replaceAll("\n", "").trim()); - - formatAnchor(prop, document.getHyperlinks(), "links"); - formatImageAnchor(prop, document.getImages()); - formatAnchor(prop, document.getAudiolinks(), "audio"); - formatAnchor(prop, document.getVideolinks(), "video"); - formatAnchor(prop, document.getApplinks(), "apps"); - formatAnchor(prop, document.getEmaillinks(), "email"); + formatHeader(prop, resInfo.getMap()); - prop.put("info_type_text", - de.anomic.data.htmlTools.replaceXMLEntities(new String(scraper.getText()))); - - i = 0; - final Iterator sentences = document.getSentences(false); - if (sentences != null) - while (sentences.hasNext()) { - prop.put("info_type_lines_" + i + "_line", - new String((StringBuffer) 
sentences.next()).replaceAll("\n", "").trim()); - i++; - } - prop.put("info_type_lines", i); - if (document != null) document.close(); + final String ff = file.toString(); + final int dotpos = ff.lastIndexOf('.'); + final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : ""; + if (ext.equals("gif") || ext.equals("jpg") || + ext.equals("png") || ext.equals("jpeg")) { + prop.put("info_type", Image); + prop.put("info_type_src", pathString); + } else { + prop.put("info_type", HtmlFile); + // fill the htmlFilerContentScraper object with the contents of the cached file + // to retrieve all needed information + final htmlFilterContentScraper scraper = new htmlFilterContentScraper(url); + //final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); + Writer writer = new htmlFilterWriter(null,null,scraper,null,false); + String sourceCharset = resInfo.getCharacterEncoding(); + if (sourceCharset == null) sourceCharset = "UTF-8"; + String mimeType = resInfo.getMimeType(); + serverFileUtils.copy(file, sourceCharset, writer); + writer.close(); + + final plasmaParserDocument document = switchboard.parser.transformScraper(url, mimeType, sourceCharset, scraper); + + prop.put("info_type_title", scraper.getTitle()); + + int i; + String[] t = document.getSectionTitles(); + prop.put("info_type_headlines", t.length); + for (i = 0; i < t.length; i++) + prop.put("info_type_headlines_" + i + "_headline", + t[i].replaceAll("\n", "").trim()); + + formatAnchor(prop, document.getHyperlinks(), "links"); + formatImageAnchor(prop, document.getImages()); + formatAnchor(prop, document.getAudiolinks(), "audio"); + formatAnchor(prop, document.getVideolinks(), "video"); + formatAnchor(prop, document.getApplinks(), "apps"); + formatAnchor(prop, document.getEmaillinks(), "email"); + + prop.put("info_type_text", new String(scraper.getText())); + + i = 0; + final Iterator sentences = document.getSentences(false); + if (sentences != null) + while 
(sentences.hasNext()) { + prop.put("info_type_lines_" + i + "_line", + new String((StringBuffer) sentences.next()).replaceAll("\n", "").trim()); + i++; + } + prop.put("info_type_lines", i); + if (document != null) document.close(); + } } - } catch (Exception e) { + } catch (IOException e) { prop.put("info_type", NotCached); + } catch (UnsupportedProtocolException e) { + prop.put("info_type", ProtocolError); + } catch (IllegalAccessException e) { + prop.put("info_type", SecurityError); } } else { prop.put("info", TypeDIR); @@ -234,11 +245,12 @@ public class CacheAdmin_p { } } } - + prop.put("cachesize", Long.toString(switchboard.cacheManager.curCacheSize/1024)); prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024)); prop.put("path", path.toString()); prop.put("info_info", info.toString()); + /* prop.put("info_tree", tree.toString()); */ // return rewrite properties return prop; diff --git a/htroot/IndexCreateWWWLocalQueue_p.html b/htroot/IndexCreateWWWLocalQueue_p.html index be7bb9d21..9acc906f5 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.html +++ b/htroot/IndexCreateWWWLocalQueue_p.html @@ -21,12 +21,12 @@ Delete Entries: This may take a quite long time. 
diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index 89f46a70e..8d30e2be9 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -70,6 +70,14 @@ public class IndexCreateWWWLocalQueue_p { return dayFormatter.format(date); } + private static final int INVALID = 0; + private static final int URL = 1; + private static final int ANCHOR = 2; + private static final int PROFILE = 3; + private static final int DEPTH = 4; + private static final int INITIATOR = 5; + private static final int MODIFIED = 6; + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { // return variable that accumulates replacements plasmaSwitchboard switchboard = (plasmaSwitchboard) env; @@ -87,55 +95,59 @@ public class IndexCreateWWWLocalQueue_p { int c = 0; String pattern = post.get("pattern", ".*").trim(); - String option = post.get("option", ".*").trim(); + final int option = post.getInt("option", INVALID); if (pattern.equals(".*")) { c = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE); switchboard.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE); try { switchboard.cleanProfiles(); } catch (InterruptedException e) {/* ignore this */} - } else{ + } else if (option > INVALID) { Pattern compiledPattern = null; try { // compiling the regular expression compiledPattern = Pattern.compile(pattern); - // iterating through the list of URLs - Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE); - plasmaCrawlEntry entry; - while (iter.hasNext()) { - if ((entry = (plasmaCrawlEntry) iter.next()) == null) continue; - String value = null; - String nextHash = entry.urlhash(); - if ((option.equals("URL")&&(entry.url() != null))) { - value = entry.url().toString(); - } else if ((option.equals("AnchorName"))) { - value = entry.name(); - } else if ((option.equals("Profile"))) { - String profileHandle = entry.profileHandle(); - if 
(profileHandle == null) { - value = "unknown"; - } else { - plasmaCrawlProfile.entry profile = switchboard.profiles.getEntry(profileHandle); - if (profile == null) { - value = "unknown"; - } else { - value = profile.name(); - } - } - } else if ((option.equals("Depth"))) { - value = Integer.toString(entry.depth()); - } else if ((option.equals("Initiator"))) { - value = (entry.initiator()==null)?"proxy":htmlTools.replaceHTML(entry.initiator()); - } else if ((option.equals("ModifiedDate"))) { - value = daydate(entry.loaddate()); + if (option == PROFILE) { + // search and delete the crawl profile (_much_ faster, independant of queue size) + // XXX: what to do about the annoying LOST PROFILE messages in the log? + Iterator it = switchboard.profiles.profiles(true); + plasmaCrawlProfile.entry entry; + while (it.hasNext()) { + entry = (plasmaCrawlProfile.entry)it.next(); + final String name = entry.name(); + if (name.equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) || + name.equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) || + name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) || + name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA)) + continue; + if (compiledPattern.matcher(name).find()) + switchboard.profiles.removeEntry(entry.handle()); } - - if (value != null) { - Matcher matcher = compiledPattern.matcher(value); - if (matcher.find()) { - switchboard.noticeURL.remove(nextHash); - } + } else { + // iterating through the list of URLs + Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE); + plasmaCrawlEntry entry; + while (iter.hasNext()) { + if ((entry = (plasmaCrawlEntry) iter.next()) == null) continue; + String value = null; + + switch (option) { + case URL: value = (entry.url() == null) ? null : entry.url().toString(); break; + case ANCHOR: value = entry.name(); break; + case DEPTH: value = Integer.toString(entry.depth()); break; + case INITIATOR: + value = (entry.initiator() == null) ? 
"proxy" : htmlTools.replaceHTML(entry.initiator()); + break; + case MODIFIED: value = daydate(entry.loaddate()); break; + default: value = null; + } + + if (value != null) { + Matcher matcher = compiledPattern.matcher(value); + if (matcher.find()) { + switchboard.noticeURL.remove(entry.urlhash()); + } + } } - } } catch (PatternSyntaxException e) { e.printStackTrace(); diff --git a/source/de/anomic/plasma/cache/ResourceInfoFactory.java b/source/de/anomic/plasma/cache/ResourceInfoFactory.java index 6e8a220f2..54f281dc4 100644 --- a/source/de/anomic/plasma/cache/ResourceInfoFactory.java +++ b/source/de/anomic/plasma/cache/ResourceInfoFactory.java @@ -57,7 +57,7 @@ public class ResourceInfoFactory { public IResourceInfo buildResourceInfoObj( URL resourceURL, Map resourceMetadata - ) throws Exception { + ) throws UnsupportedProtocolException, IllegalAccessException { String protocString = resourceURL.getProtocol(); @@ -65,25 +65,37 @@ public class ResourceInfoFactory { if (protocString.equals("https")) protocString = "http"; // the full qualified class name - String className = this.getClass().getPackage().getName() + "." + protocString + ".ResourceInfo"; - - // loading class by name - Class moduleClass = Class.forName(className); - - // getting the constructor - Constructor classConstructor = moduleClass.getConstructor( new Class[] { - URL.class, - Map.class - } ); - - // instantiating class - IResourceInfo infoObject = (IResourceInfo) classConstructor.newInstance(new Object[] { - resourceURL, - resourceMetadata - }); - - // return the newly created object - return infoObject; + final String className = this.getClass().getPackage().getName() + "." 
+ protocString + ".ResourceInfo"; + try { + // loading class by name + final Class moduleClass = Class.forName(className); + + // getting the constructor + final Constructor classConstructor = moduleClass.getConstructor( new Class[] { + URL.class, + Map.class + } ); + + // instantiating class + final IResourceInfo infoObject = (IResourceInfo) classConstructor.newInstance(new Object[] { + resourceURL, + resourceMetadata + }); + + // return the newly created object + return infoObject; + } catch (Exception e) { + if (e instanceof RuntimeException) { + throw (RuntimeException)e; + } else if (e instanceof ClassNotFoundException) { + throw new UnsupportedProtocolException(protocString, e); + } else if (e instanceof IllegalAccessException) { + throw (IllegalAccessException)e; + } else { + e.printStackTrace(); + return null; + } + } } } diff --git a/source/de/anomic/plasma/cache/UnsupportedProtocolException.java b/source/de/anomic/plasma/cache/UnsupportedProtocolException.java new file mode 100644 index 000000000..3f65470a8 --- /dev/null +++ b/source/de/anomic/plasma/cache/UnsupportedProtocolException.java @@ -0,0 +1,21 @@ +package de.anomic.plasma.cache; + +/** + * This exception is thrown when a protocol (or a derivative using this protocol) is not + * supported, as is the case in the {@link ResourceInfoFactory}. 
+ * @see package {@link de.anomic.plasma.cache} for all {@link IResourceInfo}s available + */ +public class UnsupportedProtocolException extends Exception { + + private static final long serialVersionUID = 1L; + + public static final String MESSAGE = "Unsupported protocol error: "; + + public UnsupportedProtocolException(String protocol) { + super(MESSAGE + protocol); + } + + public UnsupportedProtocolException(String protocol, Throwable cause) { + super(MESSAGE + protocol, cause); + } +} diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 9f56ddfd8..87ff83c97 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -82,6 +82,7 @@ import de.anomic.kelondro.kelondroMapObjects; import de.anomic.net.URL; import de.anomic.plasma.cache.IResourceInfo; import de.anomic.plasma.cache.ResourceInfoFactory; +import de.anomic.plasma.cache.UnsupportedProtocolException; import de.anomic.server.serverCodings; import de.anomic.server.serverFileUtils; import de.anomic.server.serverInstantThread; @@ -496,11 +497,14 @@ public final class plasmaHTCache { /** * Returns an object containing metadata about a cached resource - * @param url the url of the resource - * @return an {@link IResourceInfo info object} - * @throws Exception of the info object could not be created, e.g. 
if the protocol is not supported + * @param url the {@link URL} of the resource + * @return an {@link IResourceInfo info object} + * @throws IllegalAccessException if the {@link SecurityManager} doesn't allow instantiation + * of the info object with the given protocol + * @throws UnsupportedProtocolException if the protocol is not supported and therefore the + * info object couldn't be created */ - public IResourceInfo loadResourceInfo(URL url) throws Exception { + public IResourceInfo loadResourceInfo(URL url) throws UnsupportedProtocolException, IllegalAccessException { // getting the URL hash String urlHash = plasmaURL.urlHash(url.toNormalform()); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 148db31c1..8cf28d619 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1510,6 +1510,28 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser initProfiles(); } + /** + * {@link plasmaCrawlProfile Crawl Profiles} are saved independantly from the queues themselves + * and therefore have to be cleaned up from time to time. This method only performs the clean-up + * if - and only if - the {@link plasmaSwitchboardQueue switchboard}, + * {@link plasmaCrawlLoader loader} and {@link plasmaCrawlNURL local crawl} queues are all empty. + *

+ * Then it iterates through all existing {@link plasmaCrawlProfile crawl profiles} and removes + * all profiles which are not hardcoded. + *

+ *

+ * If this method encounters DB-failures, the profile DB will be reset and + * true will be returned + *&#13;

+ * @see #CRAWL_PROFILE_PROXY hardcoded + * @see #CRAWL_PROFILE_REMOTE hardcoded + * @see #CRAWL_PROFILE_SNIPPET_TEXT hardcoded + * @see #CRAWL_PROFILE_SNIPPET_MEDIA hardcoded + * @return whether this method has done something or not (i.e. because the queues have been filled + * or there are no profiles left to clean up) + * @throws InterruptedException if the current thread has been interrupted, i.e. by the + * shutdown procedure + */ public boolean cleanProfiles() throws InterruptedException { if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (noticeURL.stackSize() > 0)) return false; final Iterator iter = profiles.profiles(true);