diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 6f23c5b77..702244c6e 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -114,11 +114,12 @@ public class CacheAdmin_p {
else {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
+ plasmaParser.document document = switchboard.parser.transformScraper(url, "text/html", scraper);
serverFileUtils.copy(file, os);
info += "HEADLINE:
" + scraper.getHeadline() + "
";
- info += "HREF:
" + formatAnchor(scraper.getHyperlinks()) + "
";
- info += "MEDIA:
" + formatAnchor(scraper.getMedialinks()) + "
";
- info += "EMAIL:
" + formatAnchor(scraper.getEmaillinks()) + "
";
+ info += "HREF:
" + formatAnchor(document.getHyperlinks()) + "
";
+ info += "MEDIA:
" + formatAnchor(document.getMedialinks()) + "
";
+ info += "EMAIL:
" + formatAnchor(document.getEmaillinks()) + "
";
info += "TEXT:
" + new String(scraper.getText()) + "
";
}
} catch (Exception e) {
diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java
index ef147f490..e2be76047 100644
--- a/htroot/IndexCreate_p.java
+++ b/htroot/IndexCreate_p.java
@@ -305,7 +305,7 @@ public class IndexCreate_p {
prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth);
prop.put("indexing-queue_list_"+i+"_modified", daydate(pcentry.lastModified));
- prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getHyperlinks().size())));
+ prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getAnchors().size())));
prop.put("indexing-queue_list_"+i+"_anchor", ((pcentry.scraper == null) ? "-" : pcentry.scraper.getHeadline()) );
prop.put("indexing-queue_list_"+i+"_url", pcentry.urlString);
dark = !dark;
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 48ca7c8d7..c3b7343e0 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -52,10 +52,6 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
private static HashSet linkTags0;
private static HashSet linkTags1;
- public static String mediaExt =
- "swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
- "sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj";
-
static {
linkTags0 = new HashSet();
linkTags0.add("img");
@@ -67,8 +63,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
// class variables: collectors for links
- private Properties anchor;
- private Properties image;
+ private HashMap anchors;
+ private HashMap images;
private String title;
private String headline;
private serverByteBuffer text;
@@ -79,8 +75,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// it is only the reference for relative links
super(linkTags0, linkTags1);
this.root = root;
- this.anchor = new Properties();
- this.image = new Properties();
+ this.anchors = new HashMap();
+ this.images = new HashMap();
this.title = "";
this.headline = "";
this.text = new serverByteBuffer();
@@ -117,12 +113,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
public void scrapeTag0(String tagname, Properties tagopts) {
- if (tagname.equals("img")) image.setProperty(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
+ if (tagname.equals("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
}
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
//System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
- if (tagname.equals("a")) anchor.setProperty(absolutePath(tagopts.getProperty("href", "")),
+ if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")),
new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString());
if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
@@ -153,179 +149,19 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return text.getBytes();
}
- public Properties getAnchor() {
- return anchor;
- }
-
- public Properties getImage() {
- return image;
- }
-
- public Map getHyperlinks() {
- if (hyperlinks == null) resortLinks();
- return hyperlinks;
- }
-
- public Map getMedialinks() {
- if (medialinks == null) resortLinks();
- return medialinks;
- }
-
- public Map getEmaillinks() {
- if (emaillinks == null) resortLinks();
- return emaillinks;
- }
-
- HashMap hyperlinks = null;
- HashMap medialinks = null;
- HashMap emaillinks = null;
-
- private synchronized void resortLinks() {
- Iterator i;
- String url;
- int extpos;
- String ext;
- i = anchor.entrySet().iterator();
- hyperlinks = new HashMap();
- medialinks = new HashMap();
- emaillinks = new HashMap();
- Map.Entry entry;
- while (i.hasNext()) {
- entry = (Map.Entry) i.next();
- url = (String) entry.getKey();
- if ((url != null) && (url.startsWith("mailto:"))) {
- emaillinks.put(url.substring(7), entry.getValue());
- } else {
- extpos = url.lastIndexOf(".");
- String normal;
- if (extpos > 0) {
- ext = url.substring(extpos).toLowerCase();
- normal = urlNormalform(url);
- if (normal != null) {
- if (mediaExt.indexOf(ext.substring(1)) >= 0) {
- // this is not an normal anchor, its a media link
- medialinks.put(normal, entry.getValue());
- } else {
- hyperlinks.put(normal, entry.getValue());
- }
- }
- }
- }
- }
- // finally add the images to the medialinks
- i = image.entrySet().iterator();
- String normal;
- while (i.hasNext()) {
- entry = (Map.Entry) i.next();
- url = (String) entry.getKey();
- normal = urlNormalform(url);
- if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
- }
- expandHyperlinks();
- }
-
- /*
- private synchronized void resortLinks() {
- Enumeration e;
- String url;
- int extpos;
- String ext;
- e = anchor.propertyNames();
- hyperlinks = new Properties();
- medialinks = new Properties();
- emaillinks = new Properties();
- while (e.hasMoreElements()) {
- url = (String) e.nextElement();
- if ((url != null) && (url.startsWith("mailto:"))) {
- emaillinks.setProperty(url.substring(7), anchor.getProperty(url));
- } else {
- extpos = url.lastIndexOf(".");
- String normal;
- if (extpos > 0) {
- ext = url.substring(extpos).toLowerCase();
- normal = urlNormalform(url);
- if (normal != null) {
- if (mediaExt.indexOf(ext.substring(1)) >= 0) {
- // this is not an normal anchor, its a media link
- medialinks.setProperty(normal, anchor.getProperty(url));
- } else {
- hyperlinks.setProperty(normal, anchor.getProperty(url));
- }
- }
- }
- }
- }
- // finally add the images to the medialinks
- e = image.propertyNames();
- String normal;
- while (e.hasMoreElements()) {
- url = (String) e.nextElement();
- normal = urlNormalform(url);
- if (normal != null) medialinks.setProperty(normal, image.getProperty(url)); // avoid NullPointerException
- }
+ public Map getAnchors() {
+ return anchors;
}
-*/
- public synchronized void expandHyperlinks() {
- // we add artificial hyperlinks to the hyperlink set that can be calculated from
- // given hyperlinks and imagelinks
- hyperlinks.putAll(allReflinks(hyperlinks));
- hyperlinks.putAll(allReflinks(medialinks));
- hyperlinks.putAll(allSubpaths(hyperlinks));
- hyperlinks.putAll(allSubpaths(medialinks));
+ public Map getImages() {
+ return images;
}
- private static Map allReflinks(Map links) {
- // we find all links that are part of a reference inside a url
- HashMap v = new HashMap();
- Iterator i = links.keySet().iterator();
- String s;
- int pos;
- loop: while (i.hasNext()) {
- s = (String) i.next();
- if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) {
- i.remove();
- s = s.substring(pos);
- while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos);
- if (!(v.containsKey(s))) v.put(s, "ref");
- continue loop;
- }
- if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) {
- i.remove();
- s = "http:/" + s.substring(pos);
- while ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos);
- if (!(v.containsKey(s))) v.put(s, "ref");
- continue loop;
- }
- }
- return v;
- }
-
- private static Map allSubpaths(Map links) {
- HashMap v = new HashMap();
- Iterator i = links.keySet().iterator();
- String s;
- int pos;
- while (i.hasNext()) {
- s = (String) i.next();
- if (s.endsWith("/")) s = s.substring(0, s.length() - 1);
- pos = s.lastIndexOf("/");
- while (pos > 8) {
- s = s.substring(0, pos + 1);
- if (!(v.containsKey(s))) v.put(s, "sub");
- s = s.substring(0, pos);
- pos = s.lastIndexOf("/");
- }
- }
- return v;
- }
-
-
public void print() {
System.out.println("TITLE :" + title);
System.out.println("HEADLINE:" + headline);
- System.out.println("ANCHORS :" + anchor.toString());
- System.out.println("IMAGES :" + image.toString());
+ System.out.println("ANCHORS :" + anchors.toString());
+ System.out.println("IMAGES :" + images.toString());
System.out.println("TEXT :" + new String(text.getBytes()));
}
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
index 6d187d877..4e2baa5ee 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
@@ -65,6 +65,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
}
public void init(String initarg) {
+ System.out.println("Transformer init: " + initarg);
if (bluelist == null) {
// here, the initarg is used to load a list of bluelisted words
bluelist = new Vector();
@@ -78,9 +79,14 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
r.close();
} catch (Exception e) {
}
+ if (bluelist.size() == 0) System.out.println("BLUELIST is empty");
}
}
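+ // note: the transformer acts as an identity transformer when the bluelist is missing or empty;
+ // callers (e.g. the proxy handler) can test this to skip the filtering output stream entirely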
+ public boolean isIdentityTransformer() {
+ return bluelist.size() == 0;
+ }
+
private static byte[] genBlueLetters(int length) {
serverByteBuffer bb = new serverByteBuffer(" ".getBytes());
length = length / 2;
diff --git a/source/de/anomic/htmlFilter/htmlFilterTransformer.java b/source/de/anomic/htmlFilter/htmlFilterTransformer.java
index 816cf8138..bc68336ed 100644
--- a/source/de/anomic/htmlFilter/htmlFilterTransformer.java
+++ b/source/de/anomic/htmlFilter/htmlFilterTransformer.java
@@ -49,6 +49,11 @@ public interface htmlFilterTransformer {
// more specific transformation rules
public void init(String initarg);
+ // ask if this transformer will do any transformation whatsoever
+ // this may return true if the initialization resulted in a status
+ // that does not allow any transformation
+ public boolean isIdentityTransformer();
+
// tests, if a given body-less tag (i.e. <br> shall be supervised)
// only tags that are defined here will be cached and not streamed
public boolean isTag0(String tag);
diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java
index 7cbd696ee..6efb0ec6d 100644
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@@ -125,15 +125,9 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
if (!(htRootPath.exists())) htRootPath.mkdir();
}
- // load a transformer
- try {
- ClassLoader cp = new serverClassLoader(this.getClass().getClassLoader());
- Class transformerClass = cp.loadClass(switchboard.getConfig("pageTransformerClass", ""));
- transformer = (htmlFilterTransformer) transformerClass.newInstance();
- transformer.init(switchboard.getConfig("pageTransformerArg", "")); // this is usually the blueList
- } catch (Exception e) {
- transformer = null;
- }
+ // load a transformer
+ transformer = new htmlFilterContentTransformer();
+ transformer.init(new File(switchboard.getRootPath(), switchboard.getConfig("plasmaBlueList", "")).toString());
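+ // the configurable pageTransformerClass is gone; the proxy now always uses the bluelist-based
+ // content transformer, which degrades to an identity transformer if the bluelist is empty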
String f;
// load the yellow-list
@@ -396,7 +390,7 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
if (cacheExists) {
// we respond on the request by using the cache
- hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, null, switchboard.defaultProxyProfile);
+ hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, switchboard.defaultProxyProfile);
if (hpc.shallUseCache()) {
// the cache is fresh
@@ -426,7 +420,8 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative'
// make a transformer
- if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
+ if ((!(transformer.isIdentityTransformer())) &&
+ ((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
((cachedResponseHeader == null) || (httpd.isTextMime(cachedResponseHeader.mime(), switchboard.mimeWhite)))) {
hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0));
} else {
@@ -472,24 +467,30 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
res = remote.GET(remotePath, requestHeader);
long contentLength = res.responseHeader.contentLength();
- // make a scraper and transformer
+ // reserve cache entry
+ hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
+
+ // make a scraper and transformer
if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
(httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
- scraper = new htmlFilterContentScraper(url);
- hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
- if (((htmlFilterOutputStream) hfos).binarySuspect()) {
- scraper = null; // forget it, may be rubbish
- log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
+ if (transformer.isIdentityTransformer()) {
+ hfos = hpc.getContentOutputStream();
+ } else {
+ scraper = new htmlFilterContentScraper(url);
+ hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
+ if (((htmlFilterOutputStream) hfos).binarySuspect()) {
+ scraper = null; // forget it, may be rubbish
+ log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
+ }
+ hpc.scraper = scraper;
}
} else {
log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
scraper = null;
hfos = respond;
+ hpc.scraper = scraper;
}
- // reserver cache entry
- hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, scraper, null, switchboard.defaultProxyProfile);
-
// handle incoming cookies
handleIncomingCookies(res.responseHeader, host, ip);
@@ -502,7 +503,13 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
if ((contentLength > 0) && // known
(contentLength < 1048576)) // 1 MB
{
- byte[] cacheArray = res.writeContent(hfos);
+ byte[] cacheArray;
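+ // identity case: hfos is the cache entry's own content buffer (set above), so writeContentX
+ // presumably streams the response to both the buffer and the client; the cached bytes
+ // are then fetched back from the entry for the cache stack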
+ if (transformer.isIdentityTransformer()) {
+ res.writeContentX(hfos, respond);
+ cacheArray = hpc.getContentBytes();
+ } else {
+ cacheArray = res.writeContent(hfos);
+ }
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// before we came here we deleted a cache entry
if (sizeBeforeDelete == cacheArray.length) {
@@ -514,8 +521,16 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache
}
} else {
+ // the file is too big to be cached in RAM, so write it to the file system
cacheFile.getParentFile().mkdirs();
- res.writeContent(hfos, cacheFile);
+ if (transformer.isIdentityTransformer()) {
+ res.writeContent(respond, cacheFile);
+ if (contentLength < 10485760) { // 10 mb
+ serverFileUtils.copy(cacheFile, hfos);
+ } // else hfos is empty and that means: no work afterwards with it
+ } else {
+ res.writeContent(hfos, cacheFile);
+ }
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// before we came here we deleted a cache entry
if (sizeBeforeDelete == cacheFile.length()) {
@@ -579,24 +594,30 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
httpc.response res = remote.GET(remotePath, requestHeader);
long contentLength = res.responseHeader.contentLength();
- // make a scraper and transformer
+ // reserve cache entry
+ hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
+
+ // make a scraper and transformer
if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
(httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
- scraper = new htmlFilterContentScraper(url);
- hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
- if (((htmlFilterOutputStream) hfos).binarySuspect()) {
- scraper = null; // forget it, may be rubbish
- log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
+ if (transformer.isIdentityTransformer()) {
+ hfos = hpc.getContentOutputStream();
+ } else {
+ scraper = new htmlFilterContentScraper(url);
+ hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
+ if (((htmlFilterOutputStream) hfos).binarySuspect()) {
+ scraper = null; // forget it, may be rubbish
+ log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
+ }
+ hpc.scraper = scraper;
}
- } else {
+ } else {
log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
scraper = null;
hfos = respond;
+ hpc.scraper = scraper;
}
- // reserve cache entry
- hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, scraper, null, switchboard.defaultProxyProfile);
-
// handle incoming cookies
handleIncomingCookies(res.responseHeader, host, ip);
@@ -608,16 +629,29 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
if ((storeError = hpc.shallStoreCache()) == null) {
// we write a new cache entry
if ((contentLength > 0) && (contentLength < 1048576)) {
- // write to buffer
- byte[] cacheArray = res.writeContent(hfos);
- if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+ // write to buffer
+ byte[] cacheArray;
+ if (transformer.isIdentityTransformer()) {
+ res.writeContentX(hfos, respond);
+ cacheArray = hpc.getContentBytes();
+ } else {
+ cacheArray = res.writeContent(hfos);
+ }
+ if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// enQueue new entry with response header and file as byte[]
hpc.status = plasmaHTCache.CACHE_FILL;
cacheManager.stackProcess(hpc, cacheArray);
} else try {
// write to file system directly
cacheFile.getParentFile().mkdirs();
- res.writeContent(hfos, cacheFile);
+ if (transformer.isIdentityTransformer()) {
+ res.writeContent(respond, cacheFile);
+ if (contentLength < 10485760) { // 10 mb
+ serverFileUtils.copy(cacheFile, hfos);
+ } // else hfos is empty and that means: no work afterwards with it
+ } else {
+ res.writeContent(hfos, cacheFile);
+ }
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// enQueue new entry with response header
hpc.status = plasmaHTCache.CACHE_FILL;
@@ -711,9 +745,6 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
}
}
-
-
-
public void doHead(Properties conProp, httpHeader requestHeader, OutputStream respond) throws IOException {
String method = conProp.getProperty("METHOD");
String host = conProp.getProperty("HOST");
@@ -834,8 +865,6 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
respond.flush();
}
-
-
public void doConnect(Properties conProp, de.anomic.http.httpHeader requestHeader, InputStream clientIn, OutputStream clientOut) throws IOException {
String host = conProp.getProperty("HOST");
int port = Integer.parseInt(conProp.getProperty("PORT"));
diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java
index 52aa1f1ff..bc1821b22 100644
--- a/source/de/anomic/plasma/plasmaCrawlLoader.java
+++ b/source/de/anomic/plasma/plasmaCrawlLoader.java
@@ -88,7 +88,7 @@ public class plasmaCrawlLoader {
// we kill that thread
thread.interrupt(); // hopefully this wakes him up.
slots.remove(i);
- System.out.println("CRAWLER: IGNORING SLEEPING DOWNLOAD SLOT " + thread.url.toString());
+ log.logDebug("IGNORING SLEEPING DOWNLOAD SLOT " + thread.url.toString());
}
} else {
// thread i is dead, remove it
@@ -198,31 +198,26 @@ public class plasmaCrawlLoader {
// the transfer is ok
long contentLength = res.responseHeader.contentLength();
- // make a scraper and transformer
- htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
- OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
-
// reserve cache entry
- plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, scraper, initiator, profile);
+ plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile);
// request has been placed and result has been returned. work off response
File cacheFile = cacheManager.getCachePath(url);
try {
if (!(httpd.isTextMime(res.responseHeader.mime().toLowerCase(), acceptMimeTypes))) {
// if the response has not the right file type then reject file
- hfos.close();
remote.close();
- System.out.println("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString());
+ log.logInfo("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString());
htCache.status = plasmaHTCache.CACHE_UNFILLED;
} else if ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null)) {
// we write the new cache entry to file system directly
cacheFile.getParentFile().mkdirs();
- res.writeContent(hfos, cacheFile); // writes in content scraper and cache file
+ res.writeContent(htCache.getContentOutputStream(), cacheFile); // writes into the cache entry's content buffer and the cache file
htCache.status = plasmaHTCache.CACHE_FILL;
} else {
if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error);
// anyway, the content still lives in the content scraper
- res.writeContent(hfos, null); // writes only into content scraper
+ res.writeContent(htCache.getContentOutputStream(), null); // writes only into the cache entry's content buffer
htCache.status = plasmaHTCache.CACHE_PASSING;
}
// enQueue new entry with response header
@@ -240,18 +235,18 @@ public class plasmaCrawlLoader {
// but we clean the cache also, since it may be only partial
// and most possible corrupted
if (cacheFile.exists()) cacheFile.delete();
- System.out.println("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString());
+ log.logError("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString());
}
} else {
// if the response has not the right response type then reject file
- System.out.println("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString());
+ log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString());
// not processed any further
}
remote.close();
} catch (Exception e) {
// this may happen if the targeted host does not exist or anything with the
// remote server was wrong.
- System.out.println("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString());
+ log.logError("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString());
e.printStackTrace();
}
}
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index f6c809696..f5d5b9ebe 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -427,11 +427,10 @@ public class plasmaHTCache {
public Entry newEntry(Date initDate, int depth, URL url,
httpHeader requestHeader,
String responseStatus, httpHeader responseHeader,
- htmlFilterContentScraper scraper,
String initiator,
plasmaCrawlProfile.entry profile) {
//System.out.println("NEW ENTRY: " + url.toString()); // DEBUG
- return new Entry(initDate, depth, url, requestHeader, responseStatus, responseHeader, scraper, initiator, profile);
+ return new Entry(initDate, depth, url, requestHeader, responseStatus, responseHeader, initiator, profile);
}
public class Entry {
@@ -449,15 +448,17 @@ public class plasmaHTCache {
public String urlString;
public int status; // cache load/hit/stale etc status
public Date lastModified;
- public htmlFilterContentScraper scraper;
public char doctype;
public String language;
public plasmaCrawlProfile.entry profile;
private String initiator;
+ public ByteArrayOutputStream content;
+ public htmlFilterContentScraper scraper;
+
+
public Entry(Date initDate, int depth, URL url,
httpHeader requestHeader,
String responseStatus, httpHeader responseHeader,
- htmlFilterContentScraper scraper,
String initiator,
plasmaCrawlProfile.entry profile) {
@@ -478,7 +479,7 @@ public class plasmaHTCache {
this.requestHeader = requestHeader;
this.responseStatus = responseStatus;
this.responseHeader = responseHeader;
- this.scraper = scraper;
+ this.content = new ByteArrayOutputStream();
this.profile = profile;
this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator);
@@ -503,8 +504,16 @@ public class plasmaHTCache {
// to be defined later:
this.cacheArray = null;
this.status = CACHE_UNFILLED;
+ this.scraper = null;
}
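+ // the entry now buffers the loaded content itself; proxy and crawler write the resource
+ // into this stream so the switchboard can parse it later even without a scraper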
+ public OutputStream getContentOutputStream() {
+ return (OutputStream) content;
+ }
+ public byte[] getContentBytes() {
+ try { content.flush(); } catch (IOException e) {}
+ return content.toByteArray();
+ }
public String initiator() {
return initiator;
}
@@ -614,8 +623,129 @@ public class plasmaHTCache {
return null;
}
-
- public String shallIndexCache() {
+
+ public boolean shallUseCache() {
+ // decide upon header information if a specific file should be taken from the cache or not
+
+ //System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString());
+
+ // -CGI access in request
+ // CGI access makes the page very individual, and therefore not usable in caches
+ if (isPOST(urlString)) return false;
+ if (isCGI(urlString)) return false;
+
+ // -authorization cases in request
+ if (requestHeader.containsKey("AUTHORIZATION")) return false;
+
+ // -ranges in request
+ // we do not cache partial content
+ if ((requestHeader != null) && (requestHeader.containsKey("RANGE"))) return false;
+
+ //Date d1, d2;
+
+ // -if-modified-since in request
+ // The entity has to be transferred only if it has
+ // been modified since the date given by the If-Modified-Since header.
+ if (requestHeader.containsKey("IF-MODIFIED-SINCE")) {
+ // checking this makes only sense if the cached response contains
+ // a Last-Modified field. If the field does not exist, we go the safe way
+ if (!(responseHeader.containsKey("Last-Modified"))) return false;
+ // parse date
+ Date d1, d2;
+ d2 = responseHeader.lastModified(); if (d2 == null) d2 = new Date();
+ d1 = requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date();
+ // finally, we shall treat the cache as stale if the modification time is after the if-.. time
+ if (d2.after(d1)) return false;
+ }
+
+ boolean isNotPicture = !isPicture(responseHeader);
+
+ // -cookies in request
+ // unfortunately, we should reload in case of a cookie
+ // but we think that pictures can still be considered as fresh
+ if ((requestHeader.containsKey("COOKIE")) && (isNotPicture)) return false;
+
+ // -set-cookie in cached response
+ // this is a similar case as for COOKIE.
+ if ((responseHeader.containsKey("SET-COOKIE")) && (isNotPicture)) return false; // too strong
+ if ((responseHeader.containsKey("SET-COOKIE2")) && (isNotPicture)) return false; // too strong
+
+ // -pragma in cached response
+ // logically, we would not need to care about no-cache pragmas in cached response headers,
+ // because they cannot exist since they are not written to the cache.
+ // So this IF should always fail..
+ if ((responseHeader.containsKey("PRAGMA")) &&
+ (((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return false;
+
+ // calculate often needed values for freshness attributes
+ Date date = responseHeader.date();
+ Date expires = responseHeader.expires();
+ Date lastModified = responseHeader.lastModified();
+ String cacheControl = (String) responseHeader.get("Cache-Control");
+
+
+ // see for documentation also:
+ // http://www.web-caching.com/cacheability.html
+ // http://vancouver-webpages.com/CacheNow/
+
+ // look for freshness information
+ // if we don't have any freshness indication, we treat the file as stale.
+ // no handle for freshness control:
+ if ((expires == null) && (cacheControl == null) && (lastModified == null)) return false;
+
+ // -expires in cached response
+ // the expires value gives us a very easy hint when the cache is stale
+ if (expires != null) {
+ Date yesterday = new Date((new Date()).getTime() - oneday);
+ if (expires.before(yesterday)) return false;
+ }
+
+ // -lastModified in cached response
+ // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
+ // of the file and the last modified date as the age of the file. If we consider the file as
+ // middle-aged, then the maximum TTL would be cache-creation plus age.
+ // This would be a TTL factor of 100%; we want no more than 10% TTL, so that a 10 month old cache
+ // file may only be treated as fresh for one more month, not more.
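+ // example: a resource loaded 100 days after its last modification stays fresh for 10 more days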
+ if (lastModified != null) {
+ if (date == null) date = new Date();
+ long age = date.getTime() - lastModified.getTime();
+ if (age < 0) return false;
+ // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
+ // the actual living-time is new Date().getTime() - d2.getTime()
+ // therefore the cache is stale, if Date().getTime() - d2.getTime() > age/10
+ if ((new Date()).getTime() - date.getTime() > age / 10) return false;
+ }
+
+ // -cache-control in cached response
+ // the cache-control has many value options.
+ if (cacheControl != null) {
+ cacheControl = cacheControl.trim().toUpperCase();
+ if (cacheControl.startsWith("PUBLIC")) {
+ // ok, do nothing
+ } else if ((cacheControl.startsWith("PRIVATE")) ||
+ (cacheControl.startsWith("NO-CACHE")) ||
+ (cacheControl.startsWith("NO-STORE"))) {
+ // easy case
+ return false;
+ } else if (cacheControl.startsWith("MAX-AGE=")) {
+ // we need also the load date
+ if (date == null) return false;
+ try {
+ long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
+ if ((new Date()).getTime() - date.getTime() > ttl) {
+ return false;
+ }
+ } catch (Exception e) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+
+ public String shallIndexCacheForProxy() {
// decide upon header information if a specific file should be indexed
// this method returns null if the answer is 'YES'!
// if the answer is 'NO' (do not index), it returns a string with the reason
@@ -670,10 +800,8 @@ public class plasmaHTCache {
// thus we do not care about it here for indexing
// -pragma in cached response
- /*
if ((responseHeader.containsKey("PRAGMA")) &&
(((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return "Denied_(pragma_no_cache)";
- */
// see for documentation also:
// http://www.web-caching.com/cacheability.html
@@ -732,126 +860,69 @@ public class plasmaHTCache {
return null;
}
-
- public boolean shallUseCache() {
- // decide upon header information if a specific file should be taken from the cache or not
-
- //System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString());
+
+
+ public String shallIndexCacheForCrawler() {
+ // decide upon header information if a specific file should be indexed
+ // this method returns null if the answer is 'YES'!
+ // if the answer is 'NO' (do not index), it returns a string with the reason
+ // to reject the crawling demand in clear text
+ // check profile
+ if (!(profile.localIndexing())) return "Indexing_Not_Allowed";
+
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
- if (isPOST(urlString)) return false;
- if (isCGI(urlString)) return false;
+ if ((isPOST(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)";
+ if ((isCGI(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)";
// -authorization cases in request
- if (requestHeader.containsKey("AUTHORIZATION")) return false;
+ // we checked that in shallStoreCache
// -ranges in request
- // we do not cache partial content
- if ((requestHeader != null) && (requestHeader.containsKey("RANGE"))) return false;
+ // we checked that in shallStoreCache
- //Date d1, d2;
+ // a picture cannot be indexed
+ if (isPicture(responseHeader)) return "Media_Content_(Picture)";
+ if (!(isText(responseHeader))) return "Media_Content_(not_text)";
+ if (noIndexingURL(urlString)) return "Media_Content_(forbidden)";
// -if-modified-since in request
- // The entity has to be transferred only if it has
- // been modified since the date given by the If-Modified-Since header.
- if (requestHeader.containsKey("IF-MODIFIED-SINCE")) {
- // checking this makes only sense if the cached response contains
- // a Last-Modified field. If the field does not exist, we go the safe way
- if (!(responseHeader.containsKey("Last-Modified"))) return false;
- // parse date
- Date d1, d2;
- d2 = responseHeader.lastModified(); if (d2 == null) d2 = new Date();
- d1 = requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date();
- // finally, we shall treat the cache as stale if the modification time is after the if-.. time
- if (d2.after(d1)) return false;
- }
+ // if the page is fresh at the very moment we can index it
+ // -> this does not apply for the crawler
- boolean isNotPicture = !isPicture(responseHeader);
-
// -cookies in request
- // unfortunately, we should reload in case of a cookie
- // but we think that pictures can still be considered as fresh
- if ((requestHeader.containsKey("COOKIE")) && (isNotPicture)) return false;
-
- // -set-cookie in cached response
- // this is a similar case as for COOKIE.
- if ((responseHeader.containsKey("SET-COOKIE")) && (isNotPicture)) return false; // too strong
- if ((responseHeader.containsKey("SET-COOKIE2")) && (isNotPicture)) return false; // too strong
-
- // -pragma in cached response
- // logically, we would not need to care about no-cache pragmas in cached response headers,
- // because they cannot exist since they are not written to the cache.
- // So this IF should always fail..
- if ((responseHeader.containsKey("PRAGMA")) &&
- (((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return false;
-
- // calculate often needed values for freshness attributes
- Date date = responseHeader.date();
- Date expires = responseHeader.expires();
- Date lastModified = responseHeader.lastModified();
- String cacheControl = (String) responseHeader.get("Cache-Control");
-
-
- // see for documentation also:
- // http://www.web-caching.com/cacheability.html
- // http://vancouver-webpages.com/CacheNow/
+ // unfortunately, we cannot index pages which have been requested with a cookie
+ // because the returned content may be special for the client
+ // -> this does not apply for a crawler
+
+ // -set-cookie in response
+ // the set-cookie from the server does not indicate that the content is special
+ // thus we do not care about it here for indexing
+ // -> this does not apply for a crawler
+ // -pragma in cached response
+ // -> in the crawler we ignore this
+
// look for freshnes information
- // if we don't have any freshnes indication, we treat the file as stale.
- // no handle for freshness control:
- if ((expires == null) && (cacheControl == null) && (lastModified == null)) return false;
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
- if (expires != null) {
- Date yesterday = new Date((new Date()).getTime() - oneday);
- if (expires.before(yesterday)) return false;
- }
+ // sometimes the expires date is set to the past to prevent a page from being cached
+ // we use that information to see if we should index it
+ // -> this does not apply for a crawler
// -lastModified in cached response
- // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
- // of the file and the last modified date as the age of the file. If we consider the file as
- // middel-aged then, the maximum TTL would be cache-creation plus age.
- // This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
- // file may only be treated as fresh for one more month, not more.
- if (lastModified != null) {
- if (date == null) date = new Date();
- long age = date.getTime() - lastModified.getTime();
- if (age < 0) return false;
- // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
- // the actual living-time is new Date().getTime() - d2.getTime()
- // therefore the cache is stale, if Date().getTime() - d2.getTime() > age/10
- if ((new Date()).getTime() - date.getTime() > age / 10) return false;
- }
+ // this information is too weak to use it to prevent indexing
+ // even if we can apply a TTL heuristic for cache usage
// -cache-control in cached response
// the cache-control has many value options.
- if (cacheControl != null) {
- cacheControl = cacheControl.trim().toUpperCase();
- if (cacheControl.startsWith("PUBLIC")) {
- // ok, do nothing
- } else if ((cacheControl.startsWith("PRIVATE")) ||
- (cacheControl.startsWith("NO-CACHE")) ||
- (cacheControl.startsWith("NO-STORE"))) {
- // easy case
- return false;
- } else if (cacheControl.startsWith("MAX-AGE=")) {
- // we need also the load date
- if (date == null) return false;
- try {
- long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
- if ((new Date()).getTime() - date.getTime() > ttl) {
- return false;
- }
- } catch (Exception e) {
- return false;
- }
- }
- }
+ // -> in the crawler we ignore this
- return true;
+ return null;
}
+
}
}
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 747f9c7eb..aafb9e83f 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -59,17 +59,25 @@ public class plasmaParser {
}
- public document parse(URL location, String mimeType, byte[] source) {
+ public document parseSource(URL location, String mimeType, byte[] source) {
// make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
try {
hfos.write(source);
+ return transformScraper(location, mimeType, scraper);
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
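+ // builds a document directly from an already-filled scraper, so content that was scraped
+ // on the fly (e.g. by the proxy) does not have to be parsed a second time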
+ public document transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
+ try {
return new document(new URL(urlNormalform(location)),
- mimeType, null, null, scraper.getHeadline(),
- null, null,
- scraper.getText(), scraper.getAnchor(), scraper.getImage());
- } catch (Exception e) {
+ mimeType, null, null, scraper.getHeadline(),
+ null, null,
+ scraper.getText(), scraper.getAnchors(), scraper.getImages());
+ } catch (MalformedURLException e) {
return null;
}
}
@@ -89,8 +97,6 @@ public class plasmaParser {
return us;
}
-
-
public class document {
URL location; // the source url
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 633f4610e..68aa6c746 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -147,6 +147,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
public HashSet extensionBlack;
public HashMap outgoingCookies, incomingCookies;
public kelondroTables facilityDB;
+ public plasmaParser parser;
public int serverJobs;
public boolean terminate = false;
@@ -203,28 +204,10 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
// make crawl profiles database and default profiles
profiles = new plasmaCrawlProfile(new File(plasmaPath, "crawlProfiles0.db"));
-
- //System.out.println("profiles.size=" + profiles.size());
- //System.out.println("profile-config=" + getConfig("defaultProxyProfile", "").length());
- //System.out.println("profile-entry=" + profiles.getEntry(getConfig("defaultProxyProfile", "")).toString());
- if ((profiles.size() == 0) ||
- (getConfig("defaultProxyProfile", "").length() == 0) ||
- (profiles.getEntry(getConfig("defaultProxyProfile", "")) == null)) {
- // generate new default entry for proxy crawling
- defaultProxyProfile = profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), false, true, true, true, false, true, true, true);
- setConfig("defaultProxyProfile", defaultProxyProfile.handle());
- } else {
- defaultProxyProfile = profiles.getEntry(getConfig("defaultProxyProfile", ""));
- }
- if ((profiles.size() == 1) ||
- (getConfig("defaultRemoteProfile", "").length() == 0) ||
- (profiles.getEntry(getConfig("defaultRemoteProfile", "")) == null)) {
- // generate new default entry for proxy crawling
- defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, false, false, true, true, false, true, true, false);
- setConfig("defaultRemoteProfile", defaultRemoteProfile.handle());
- } else {
- defaultRemoteProfile = profiles.getEntry(getConfig("defaultRemoteProfile", ""));
- }
+ initProfiles();
+
+ // make parser
+ parser = new plasmaParser(new File(""));
// start indexing management
loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL);
@@ -309,14 +292,46 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
this.serverJobs = jobs;
}
+ private void initProfiles() throws IOException {
+ if ((profiles.size() == 0) ||
+ (getConfig("defaultProxyProfile", "").length() == 0) ||
+ (profiles.getEntry(getConfig("defaultProxyProfile", "")) == null)) {
+ // generate new default entry for proxy crawling
+ defaultProxyProfile = profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), false, true, true, true, false, true, true, true);
+ setConfig("defaultProxyProfile", defaultProxyProfile.handle());
+ } else {
+ defaultProxyProfile = profiles.getEntry(getConfig("defaultProxyProfile", ""));
+ }
+ if ((profiles.size() == 1) ||
+ (getConfig("defaultRemoteProfile", "").length() == 0) ||
+ (profiles.getEntry(getConfig("defaultRemoteProfile", "")) == null)) {
+ // generate new default entry for proxy crawling
+ defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, false, false, true, true, false, true, true, false);
+ setConfig("defaultRemoteProfile", defaultRemoteProfile.handle());
+ } else {
+ defaultRemoteProfile = profiles.getEntry(getConfig("defaultRemoteProfile", ""));
+ }
+ }
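+ // emergency repair: if the profile database is corrupted (see cleanProfiles below),
+ // it is deleted and re-created with the default proxy and remote profiles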
+ private void resetProfiles() {
+ File pdb = new File(plasmaPath, "crawlProfiles0.db");
+ if (pdb.exists()) pdb.delete();
+ try {
+ profiles = new plasmaCrawlProfile(pdb);
+ initProfiles();
+ } catch (IOException e) {}
+ }
private void cleanProfiles() {
if (totalSize() > 0) return;
Iterator i = profiles.profiles(true);
plasmaCrawlProfile.entry entry;
- while (i.hasNext()) {
- entry = (plasmaCrawlProfile.entry) i.next();
- if (!((entry.name().equals("proxy")) || (entry.name().equals("remote")))) i.remove();
- }
+ try {
+ while (i.hasNext()) {
+ entry = (plasmaCrawlProfile.entry) i.next();
+ if (!((entry.name().equals("proxy")) || (entry.name().equals("remote")))) i.remove();
+ }
+ } catch (kelondroException e) {
+ resetProfiles();
+ }
}
public plasmaHTCache getCacheManager() {
@@ -454,7 +469,8 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
private synchronized void processResourceStack(plasmaHTCache.Entry entry) {
// work off one stack entry with a fresh resource (scraped web page)
- if (entry.scraper != null) try {
+ byte[] content;
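+ // an entry is processed if it carries raw content (parsed below) or a scraper that was
+ // already filled while the proxy streamed the resource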
+ if (((content = entry.getContentBytes()).length > 0) || (entry.scraper != null)) try {
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here)
@@ -479,10 +495,20 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
log.logDebug("processResourceStack: processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", url=" + entry.url); // DEBUG
+ // parse content
+ plasmaParser.document document;
+ if (entry.scraper != null) {
+ log.logDebug("(Parser) '" + entry.urlString + "' is pre-parsed by scraper");
+ document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
+ } else {
+ log.logDebug("(Parser) '" + entry.urlString + "' is not parsed, parsing now");
+ document = parser.parseSource(entry.url, entry.responseHeader.mime(), content);
+ }
+
// put anchors on crawl stack
if (((processCase == 4) || (processCase == 5)) &&
(entry.depth < entry.profile.generalDepth())) {
- Map hl = entry.scraper.getHyperlinks();
+ Map hl = document.getHyperlinks();
Iterator i = hl.entrySet().iterator();
String nexturlstring;
String rejectReason;
@@ -500,18 +526,26 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
}
}
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() +
- ", NEW CRAWL STACK SIZE IS " + noticeURL.localStackSize());
+ ", NEW CRAWL STACK SIZE IS " + noticeURL.localStackSize());
}
// create index
- String noIndexReason;
- String descr = entry.scraper.getHeadline();
+
+ String descr = document.getMainLongTitle();
URL referrerURL = entry.referrerURL();
String referrerHash = (referrerURL == null) ? plasmaURL.dummyHash : plasmaURL.urlHash(referrerURL);
- if ((noIndexReason = entry.shallIndexCache()) == null ) {
+ String noIndexReason = "unspecified";
+ if (processCase == 4) {
+ // proxy-load
+ noIndexReason = entry.shallIndexCacheForProxy();
+ } else {
+ // normal crawling
+ noIndexReason = entry.shallIndexCacheForCrawler();
+ }
+ if (noIndexReason == null) {
// strip out words
log.logDebug("(Profile) Condensing for '" + entry.urlString + "'");
- plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(entry.scraper.getText()));
+ plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText()));
//log.logInfo("INDEXING HEADLINE:" + descr);
try {
@@ -573,7 +607,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
}
// explicit delete/free resources
- entry.scraper = null; entry = null;
+ document = null; entry = null;
} catch (IOException e) {
log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString());
}
@@ -1310,6 +1344,10 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
log.logError("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage());
e.printStackTrace();
return new plasmaWordIndexEntity[0];
+ } catch (kelondroException e) {
+ log.logError("selectTransferIndexes database corrupted: " + e.getMessage());
+ e.printStackTrace();
+ return new plasmaWordIndexEntity[0];
}
}
diff --git a/source/de/anomic/plasma/plasmaWordIndexFileCache.java b/source/de/anomic/plasma/plasmaWordIndexFileCache.java
index 517686efb..c714ce6f7 100644
--- a/source/de/anomic/plasma/plasmaWordIndexFileCache.java
+++ b/source/de/anomic/plasma/plasmaWordIndexFileCache.java
@@ -62,6 +62,7 @@ package de.anomic.plasma;
import java.io.*;
import java.util.*;
+import de.anomic.server.*;
import de.anomic.kelondro.*;
public class plasmaWordIndexFileCache {
@@ -72,24 +73,43 @@ public class plasmaWordIndexFileCache {
// class variables
private File databaseRoot;
private kelondroTree indexCache;
+ private int bufferkb;
public plasmaWordIndexFileCache(File databaseRoot, int bufferkb) throws IOException {
this.databaseRoot = databaseRoot;
+ this.bufferkb = bufferkb;
File indexCacheFile = new File(databaseRoot, indexCacheFileName);
if (indexCacheFile.exists()) {
// simply open the file
indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400);
} else {
- // create a new file
- int[] columns = new int[buffers + 2];
- columns[0] = plasmaWordIndexEntry.wordHashLength;
- columns[1] = 1;
- for (int i = 0; i < buffers; i++) columns[i + 2] = plasmaCrawlLURL.urlHashLength + plasmaWordIndexEntry.attrSpaceShort;
- indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400, columns);
+ createCacheFile(indexCacheFile);
}
}
-
+ private void resetCacheFile() {
+ // this has to be used in emergencies only
+ // it can happen that there is a serious db inconsistency; in that case we re-create the indexCache
+ try { indexCache.close(); } catch (IOException e) {}
+ File indexCacheFile = new File(databaseRoot, indexCacheFileName);
+ if (indexCacheFile.exists()) indexCacheFile.delete();
+ try {
+ createCacheFile(indexCacheFile);
+ } catch (IOException e) {
+ de.anomic.server.serverLog.logError("PLASMA", "plasmaWordIndexFileCache.resetCacheFile: serious failure creating the cache file: " + e.getMessage());
+ indexCache = null;
+ }
+ }
+
+ private void createCacheFile(File indexCacheFile) throws IOException {
+ // create a new file
+ int[] columns = new int[buffers + 2];
+ columns[0] = plasmaWordIndexEntry.wordHashLength;
+ columns[1] = 1;
+ for (int i = 0; i < buffers; i++) columns[i + 2] = plasmaCrawlLURL.urlHashLength + plasmaWordIndexEntry.attrSpaceShort;
+ indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400, columns);
+ }
+
protected void close() throws IOException {
indexCache.close();
indexCache = null;
@@ -162,8 +182,12 @@ public class plasmaWordIndexFileCache {
indexCache.put(row);
} catch (kelondroException e) {
// this is a very bad case; a database inconsistency occurred
- deleteComplete(wordHash);
- System.out.println("fatal error in plasmaWordIndexFileCacle.addEntriesToIndex: write to word hash file " + wordHash + " failed - " + e.getMessage() + " - index deleted.");
+ serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted");
+ resetCacheFile();
+ } catch (IOException e) {
+ // this is a very bad case; a database inconsistency occurred
+ serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted");
+ resetCacheFile();
}
}
// finished!
diff --git a/source/yacy.java b/source/yacy.java
index 51e2d4c15..eca12829c 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -208,7 +208,7 @@ public class yacy {
}
// init parser
- de.anomic.htmlFilter.htmlFilterContentScraper.mediaExt = sb.getConfig("mediaExt","");
+ de.anomic.plasma.plasmaParser.mediaExt = sb.getConfig("mediaExt","");
// start main threads
try {
diff --git a/yacy.blue b/yacy.blue
new file mode 100644
index 000000000..d2833eb81
--- /dev/null
+++ b/yacy.blue
@@ -0,0 +1 @@
+testblue
diff --git a/yacy.init b/yacy.init
index f0f62fe49..97b01a63b 100644
--- a/yacy.init
+++ b/yacy.init
@@ -166,16 +166,8 @@ remoteProxyUse=false
#remoteProxyUse=true
# the proxy may filter the content of transferred web pages
-# this is archieved using a special filtering class that can be
-# exchanged like a transformation plug-in
-# If you want to do this, you must implement the htmlFilterTransformer
-# -Interface and set the name of the implementing class here.
-# As a default, we use a filtering Transformer that takes a blacklist
-# and blocks all text fragments where a word from the blacklist appears
-# as the blacklist, we use the search-engine's blue-list
-# please see that class as an implementation example for your own transformers
-pageTransformerClass=htmlFilterContentTransformer
-pageTransformerArg=yacy.blue
+# the bluelist removes specific keywords from web pages
+proxyBlueList=yacy.blue
# security settigns
# we provide proxy and server security through a 2-stage security gate: