diff --git a/htroot/CacheAdmin_p.html b/htroot/CacheAdmin_p.html
index 21e77006c..eafa7464d 100644
--- a/htroot/CacheAdmin_p.html
+++ b/htroot/CacheAdmin_p.html
@@ -96,7 +96,12 @@
#[line]##{/lines}#
::- This file is not cached -
- ::#(/type)#
+ ::
+ ::- The protocol #[protoc]# is not supported by YaCy
+ ::
+ - IllegalAccessException - Security Manager is blocking dynamic class loading
+ but should not be active. Please report this incident!
+ #(/type)#
::
diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index da1b691ac..8f0494c2b 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -52,6 +52,7 @@
import java.io.File;
import java.io.FilenameFilter;
+import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.Map;
@@ -66,6 +67,7 @@ import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
+import de.anomic.plasma.cache.UnsupportedProtocolException;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -80,6 +82,8 @@ public class CacheAdmin_p {
private static final int HtmlFile = 0;
private static final int NotCached = 1;
private static final int Image = 2;
+ private static final int ProtocolError = 3;
+ private static final int SecurityError = 4;
public static final class Filter implements FilenameFilter {
private static final String EXCLUDE_NAME = plasmaHTCache.DB_NAME;
@@ -105,7 +109,7 @@ public class CacheAdmin_p {
pathString = "/";
file = new File(switchboard.htCachePath, pathString);
}
- } catch (Exception e) {
+ } catch (IOException e) {
pathString = "/";
file = new File(switchboard.htCachePath, pathString);
}
@@ -129,62 +133,69 @@ public class CacheAdmin_p {
info.ensureCapacity(10000);
try {
final IResourceInfo resInfo = switchboard.cacheManager.loadResourceInfo(url);
- formatHeader(prop, resInfo.getMap());
-
- final String ff = file.toString();
- final int dotpos = ff.lastIndexOf('.');
- final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : "";
- if (ext.equals("gif") || ext.equals("jpg") ||
- ext.equals("png") || ext.equals("jpeg")) {
- prop.put("info_type", Image);
- prop.put("info_type_src", pathString);
+ if (resInfo == null) {
+ prop.put("info_type", NotCached);
} else {
- prop.put("info_type", HtmlFile);
- // fill the htmlFilerContentScraper object with the contents of the cached file
- // to retrieve all needed information
- final htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
- //final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
- Writer writer = new htmlFilterWriter(null,null,scraper,null,false);
- String sourceCharset = resInfo.getCharacterEncoding();
- if (sourceCharset == null) sourceCharset = "UTF-8";
- String mimeType = resInfo.getMimeType();
- serverFileUtils.copy(file, sourceCharset, writer);
- writer.close();
-
- final plasmaParserDocument document = switchboard.parser.transformScraper(url, mimeType, sourceCharset, scraper);
-
- prop.put("info_type_title", scraper.getTitle());
-
- int i;
- String[] t = document.getSectionTitles();
- prop.put("info_type_headlines", t.length);
- for (i = 0; i < t.length; i++)
- prop.put("info_type_headlines_" + i + "_headline",
- t[i].replaceAll("\n", "").trim());
-
- formatAnchor(prop, document.getHyperlinks(), "links");
- formatImageAnchor(prop, document.getImages());
- formatAnchor(prop, document.getAudiolinks(), "audio");
- formatAnchor(prop, document.getVideolinks(), "video");
- formatAnchor(prop, document.getApplinks(), "apps");
- formatAnchor(prop, document.getEmaillinks(), "email");
+ formatHeader(prop, resInfo.getMap());
- prop.put("info_type_text",
- de.anomic.data.htmlTools.replaceXMLEntities(new String(scraper.getText())));
-
- i = 0;
- final Iterator sentences = document.getSentences(false);
- if (sentences != null)
- while (sentences.hasNext()) {
- prop.put("info_type_lines_" + i + "_line",
- new String((StringBuffer) sentences.next()).replaceAll("\n", "").trim());
- i++;
- }
- prop.put("info_type_lines", i);
- if (document != null) document.close();
+ final String ff = file.toString();
+ final int dotpos = ff.lastIndexOf('.');
+ final String ext = (dotpos >= 0) ? ff.substring(dotpos + 1).toLowerCase() : "";
+ if (ext.equals("gif") || ext.equals("jpg") ||
+ ext.equals("png") || ext.equals("jpeg")) {
+ prop.put("info_type", Image);
+ prop.put("info_type_src", pathString);
+ } else {
+ prop.put("info_type", HtmlFile);
+ // fill the htmlFilterContentScraper object with the contents of the cached file
+ // to retrieve all needed information
+ final htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
+ //final OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
+ Writer writer = new htmlFilterWriter(null,null,scraper,null,false);
+ String sourceCharset = resInfo.getCharacterEncoding();
+ if (sourceCharset == null) sourceCharset = "UTF-8";
+ String mimeType = resInfo.getMimeType();
+ serverFileUtils.copy(file, sourceCharset, writer);
+ writer.close();
+
+ final plasmaParserDocument document = switchboard.parser.transformScraper(url, mimeType, sourceCharset, scraper);
+
+ prop.put("info_type_title", scraper.getTitle());
+
+ int i;
+ String[] t = document.getSectionTitles();
+ prop.put("info_type_headlines", t.length);
+ for (i = 0; i < t.length; i++)
+ prop.put("info_type_headlines_" + i + "_headline",
+ t[i].replaceAll("\n", "").trim());
+
+ formatAnchor(prop, document.getHyperlinks(), "links");
+ formatImageAnchor(prop, document.getImages());
+ formatAnchor(prop, document.getAudiolinks(), "audio");
+ formatAnchor(prop, document.getVideolinks(), "video");
+ formatAnchor(prop, document.getApplinks(), "apps");
+ formatAnchor(prop, document.getEmaillinks(), "email");
+
+ prop.put("info_type_text", new String(scraper.getText()));
+
+ i = 0;
+ final Iterator sentences = document.getSentences(false);
+ if (sentences != null)
+ while (sentences.hasNext()) {
+ prop.put("info_type_lines_" + i + "_line",
+ new String((StringBuffer) sentences.next()).replaceAll("\n", "").trim());
+ i++;
+ }
+ prop.put("info_type_lines", i);
+ if (document != null) document.close();
+ }
}
- } catch (Exception e) {
+ } catch (IOException e) {
prop.put("info_type", NotCached);
+ } catch (UnsupportedProtocolException e) {
+ prop.put("info_type", ProtocolError);
+ } catch (IllegalAccessException e) {
+ prop.put("info_type", SecurityError);
}
} else {
prop.put("info", TypeDIR);
@@ -234,11 +245,12 @@ public class CacheAdmin_p {
}
}
}
-
+
prop.put("cachesize", Long.toString(switchboard.cacheManager.curCacheSize/1024));
prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024));
prop.put("path", path.toString());
prop.put("info_info", info.toString());
+
/* prop.put("info_tree", tree.toString()); */
// return rewrite properties
return prop;
diff --git a/htroot/IndexCreateWWWLocalQueue_p.html b/htroot/IndexCreateWWWLocalQueue_p.html
index be7bb9d21..9acc906f5 100644
--- a/htroot/IndexCreateWWWLocalQueue_p.html
+++ b/htroot/IndexCreateWWWLocalQueue_p.html
@@ -21,12 +21,12 @@
Delete Entries:
This may take a quite long time.
diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java
index 89f46a70e..8d30e2be9 100644
--- a/htroot/IndexCreateWWWLocalQueue_p.java
+++ b/htroot/IndexCreateWWWLocalQueue_p.java
@@ -70,6 +70,14 @@ public class IndexCreateWWWLocalQueue_p {
return dayFormatter.format(date);
}
+ private static final int INVALID = 0;
+ private static final int URL = 1;
+ private static final int ANCHOR = 2;
+ private static final int PROFILE = 3;
+ private static final int DEPTH = 4;
+ private static final int INITIATOR = 5;
+ private static final int MODIFIED = 6;
+
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
// return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
@@ -87,55 +95,59 @@ public class IndexCreateWWWLocalQueue_p {
int c = 0;
String pattern = post.get("pattern", ".*").trim();
- String option = post.get("option", ".*").trim();
+ final int option = post.getInt("option", INVALID);
if (pattern.equals(".*")) {
c = switchboard.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
switchboard.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
try { switchboard.cleanProfiles(); } catch (InterruptedException e) {/* ignore this */}
- } else{
+ } else if (option > INVALID) {
Pattern compiledPattern = null;
try {
// compiling the regular expression
compiledPattern = Pattern.compile(pattern);
- // iterating through the list of URLs
- Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
- plasmaCrawlEntry entry;
- while (iter.hasNext()) {
- if ((entry = (plasmaCrawlEntry) iter.next()) == null) continue;
- String value = null;
- String nextHash = entry.urlhash();
- if ((option.equals("URL")&&(entry.url() != null))) {
- value = entry.url().toString();
- } else if ((option.equals("AnchorName"))) {
- value = entry.name();
- } else if ((option.equals("Profile"))) {
- String profileHandle = entry.profileHandle();
- if (profileHandle == null) {
- value = "unknown";
- } else {
- plasmaCrawlProfile.entry profile = switchboard.profiles.getEntry(profileHandle);
- if (profile == null) {
- value = "unknown";
- } else {
- value = profile.name();
- }
- }
- } else if ((option.equals("Depth"))) {
- value = Integer.toString(entry.depth());
- } else if ((option.equals("Initiator"))) {
- value = (entry.initiator()==null)?"proxy":htmlTools.replaceHTML(entry.initiator());
- } else if ((option.equals("ModifiedDate"))) {
- value = daydate(entry.loaddate());
+ if (option == PROFILE) {
+ // search and delete the crawl profile (_much_ faster, independent of queue size)
+ // XXX: what to do about the annoying LOST PROFILE messages in the log?
+ Iterator it = switchboard.profiles.profiles(true);
+ plasmaCrawlProfile.entry entry;
+ while (it.hasNext()) {
+ entry = (plasmaCrawlProfile.entry)it.next();
+ final String name = entry.name();
+ if (name.equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) ||
+ name.equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) ||
+ name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
+ name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA))
+ continue;
+ if (compiledPattern.matcher(name).find())
+ switchboard.profiles.removeEntry(entry.handle());
}
-
- if (value != null) {
- Matcher matcher = compiledPattern.matcher(value);
- if (matcher.find()) {
- switchboard.noticeURL.remove(nextHash);
- }
+ } else {
+ // iterating through the list of URLs
+ Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
+ plasmaCrawlEntry entry;
+ while (iter.hasNext()) {
+ if ((entry = (plasmaCrawlEntry) iter.next()) == null) continue;
+ String value = null;
+
+ switch (option) {
+ case URL: value = (entry.url() == null) ? null : entry.url().toString(); break;
+ case ANCHOR: value = entry.name(); break;
+ case DEPTH: value = Integer.toString(entry.depth()); break;
+ case INITIATOR:
+ value = (entry.initiator() == null) ? "proxy" : htmlTools.replaceHTML(entry.initiator());
+ break;
+ case MODIFIED: value = daydate(entry.loaddate()); break;
+ default: value = null;
+ }
+
+ if (value != null) {
+ Matcher matcher = compiledPattern.matcher(value);
+ if (matcher.find()) {
+ switchboard.noticeURL.remove(entry.urlhash());
+ }
+ }
}
-
}
} catch (PatternSyntaxException e) {
e.printStackTrace();
diff --git a/source/de/anomic/plasma/cache/ResourceInfoFactory.java b/source/de/anomic/plasma/cache/ResourceInfoFactory.java
index 6e8a220f2..54f281dc4 100644
--- a/source/de/anomic/plasma/cache/ResourceInfoFactory.java
+++ b/source/de/anomic/plasma/cache/ResourceInfoFactory.java
@@ -57,7 +57,7 @@ public class ResourceInfoFactory {
public IResourceInfo buildResourceInfoObj(
URL resourceURL,
Map resourceMetadata
- ) throws Exception {
+ ) throws UnsupportedProtocolException, IllegalAccessException {
String protocString = resourceURL.getProtocol();
@@ -65,25 +65,37 @@ public class ResourceInfoFactory {
if (protocString.equals("https")) protocString = "http";
// the full qualified class name
- String className = this.getClass().getPackage().getName() + "." + protocString + ".ResourceInfo";
-
- // loading class by name
- Class moduleClass = Class.forName(className);
-
- // getting the constructor
- Constructor classConstructor = moduleClass.getConstructor( new Class[] {
- URL.class,
- Map.class
- } );
-
- // instantiating class
- IResourceInfo infoObject = (IResourceInfo) classConstructor.newInstance(new Object[] {
- resourceURL,
- resourceMetadata
- });
-
- // return the newly created object
- return infoObject;
+ final String className = this.getClass().getPackage().getName() + "." + protocString + ".ResourceInfo";
+ try {
+ // loading class by name
+ final Class moduleClass = Class.forName(className);
+
+ // getting the constructor
+ final Constructor classConstructor = moduleClass.getConstructor( new Class[] {
+ URL.class,
+ Map.class
+ } );
+
+ // instantiating class
+ final IResourceInfo infoObject = (IResourceInfo) classConstructor.newInstance(new Object[] {
+ resourceURL,
+ resourceMetadata
+ });
+
+ // return the newly created object
+ return infoObject;
+ } catch (Exception e) {
+ if (e instanceof RuntimeException) {
+ throw (RuntimeException)e;
+ } else if (e instanceof ClassNotFoundException) {
+ throw new UnsupportedProtocolException(protocString, e);
+ } else if (e instanceof IllegalAccessException) {
+ throw (IllegalAccessException)e;
+ } else {
+ e.printStackTrace();
+ return null;
+ }
+ }
}
}
diff --git a/source/de/anomic/plasma/cache/UnsupportedProtocolException.java b/source/de/anomic/plasma/cache/UnsupportedProtocolException.java
new file mode 100644
index 000000000..3f65470a8
--- /dev/null
+++ b/source/de/anomic/plasma/cache/UnsupportedProtocolException.java
@@ -0,0 +1,21 @@
+package de.anomic.plasma.cache;
+
+/**
+ * This exception is thrown when a protocol (or a derivative using this protocol) is not
+ * supported, as is the case in the {@link ResourceInfoFactory}.
+ * @see package {@link de.anomic.plasma.cache} for all {@link IResourceInfo}s available
+ */
+public class UnsupportedProtocolException extends Exception {
+
+ private static final long serialVersionUID = 1L;
+
+ public static final String MESSAGE = "Unsupported protocol error: ";
+
+ public UnsupportedProtocolException(String protocol) {
+ super(MESSAGE + protocol);
+ }
+
+ public UnsupportedProtocolException(String protocol, Throwable cause) {
+ super(MESSAGE + protocol, cause);
+ }
+}
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index 9f56ddfd8..87ff83c97 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -82,6 +82,7 @@ import de.anomic.kelondro.kelondroMapObjects;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.cache.ResourceInfoFactory;
+import de.anomic.plasma.cache.UnsupportedProtocolException;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverInstantThread;
@@ -496,11 +497,14 @@ public final class plasmaHTCache {
/**
* Returns an object containing metadata about a cached resource
- * @param url the url of the resource
- * @return an {@link IResourceInfo info object}
- * @throws Exception of the info object could not be created, e.g. if the protocol is not supported
+ * @param url the {@link URL} of the resource
+ * @return an {@link IResourceInfo info object}
+ * @throws IllegalAccessException if the {@link SecurityManager} doesn't allow instantiation
+ * of the info object with the given protocol
+ * @throws UnsupportedProtocolException if the protocol is not supported and therefore the
+ * info object couldn't be created
*/
- public IResourceInfo loadResourceInfo(URL url) throws Exception {
+ public IResourceInfo loadResourceInfo(URL url) throws UnsupportedProtocolException, IllegalAccessException {
// getting the URL hash
String urlHash = plasmaURL.urlHash(url.toNormalform());
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 148db31c1..8cf28d619 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1510,6 +1510,28 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
initProfiles();
}
+ /**
+ * {@link plasmaCrawlProfile Crawl Profiles} are saved independently from the queues themselves
+ * and therefore have to be cleaned up from time to time. This method only performs the clean-up
+ * if - and only if - the {@link plasmaSwitchboardQueue switchboard},
+ * {@link plasmaCrawlLoader loader} and {@link plasmaCrawlNURL local crawl} queues are all empty.
+ *
+ * Then it iterates through all existing {@link plasmaCrawlProfile crawl profiles} and removes
+ * all profiles which are not hardcoded.
+ *
+ *
+ * If this method encounters DB-failures, the profile DB will be reset and
+ * true will be returned
+ *
+ * @see #CRAWL_PROFILE_PROXY hardcoded
+ * @see #CRAWL_PROFILE_REMOTE hardcoded
+ * @see #CRAWL_PROFILE_SNIPPET_TEXT hardcoded
+ * @see #CRAWL_PROFILE_SNIPPET_MEDIA hardcoded
+ * @return whether this method has done something or not (e.g. because the queues have been filled
+ * or there are no profiles left to clean up)
+ * @throws InterruptedException if the current thread has been interrupted, i.e. by the
+ * shutdown procedure
+ */
public boolean cleanProfiles() throws InterruptedException {
if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (noticeURL.stackSize() > 0)) return false;
final Iterator iter = profiles.profiles(true);