From e7d055b98e20292d79cd59c733719ed943bc97df Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 13 Apr 2005 15:52:00 +0000 Subject: [PATCH] very experimental integration of the new generic parser and optional disabling of bluelist filtering in proxy. Does not yet work properly. To disable the disable-feature, the presence of a non-empty bluelist is necessary git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@17 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 7 +- htroot/IndexCreate_p.java | 2 +- .../htmlFilter/htmlFilterContentScraper.java | 188 +----------- .../htmlFilterContentTransformer.java | 6 + .../htmlFilter/htmlFilterTransformer.java | 5 + source/de/anomic/http/httpdProxyHandler.java | 111 ++++--- .../de/anomic/plasma/plasmaCrawlLoader.java | 21 +- source/de/anomic/plasma/plasmaHTCache.java | 281 +++++++++++------- source/de/anomic/plasma/plasmaParser.java | 20 +- .../de/anomic/plasma/plasmaSwitchboard.java | 106 ++++--- .../plasma/plasmaWordIndexFileCache.java | 42 ++- source/yacy.java | 2 +- yacy.blue | 1 + yacy.init | 12 +- 14 files changed, 404 insertions(+), 400 deletions(-) create mode 100644 yacy.blue diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 6f23c5b77..702244c6e 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -114,11 +114,12 @@ public class CacheAdmin_p { else { htmlFilterContentScraper scraper = new htmlFilterContentScraper(url); OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); + plasmaParser.document document = switchboard.parser.transformScraper(url, "text/html", scraper); serverFileUtils.copy(file, os); info += "HEADLINE:
" + scraper.getHeadline() + "
<br><br>";
-            info += "HREF:<br>" + formatAnchor(scraper.getHyperlinks()) + "<br>";
-            info += "MEDIA:<br>" + formatAnchor(scraper.getMedialinks()) + "<br>";
-            info += "EMAIL:<br>" + formatAnchor(scraper.getEmaillinks()) + "<br>";
+            info += "HREF:<br>" + formatAnchor(document.getHyperlinks()) + "<br>";
+            info += "MEDIA:<br>" + formatAnchor(document.getMedialinks()) + "<br>";
+            info += "EMAIL:<br>" + formatAnchor(document.getEmaillinks()) + "<br>";
             info += "TEXT:<br>" + new String(scraper.getText()) + "<br>
"; } } catch (Exception e) { diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index ef147f490..e2be76047 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -305,7 +305,7 @@ public class IndexCreate_p { prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth); prop.put("indexing-queue_list_"+i+"_modified", daydate(pcentry.lastModified)); - prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getHyperlinks().size()))); + prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getAnchors().size()))); prop.put("indexing-queue_list_"+i+"_anchor", ((pcentry.scraper == null) ? "-" : pcentry.scraper.getHeadline()) ); prop.put("indexing-queue_list_"+i+"_url", pcentry.urlString); dark = !dark; diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 48ca7c8d7..c3b7343e0 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -52,10 +52,6 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen private static HashSet linkTags0; private static HashSet linkTags1; - public static String mediaExt = - "swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," + - "sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj"; - static { linkTags0 = new HashSet(); linkTags0.add("img"); @@ -67,8 +63,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } // class variables: collectors for links - private Properties anchor; - private Properties image; + private HashMap anchors; + private HashMap images; private String title; private String headline; private serverByteBuffer text; @@ -79,8 +75,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen // it is only the reference for relative links super(linkTags0, linkTags1); this.root = root; - this.anchor = new Properties(); - this.image = new Properties(); + this.anchors = new HashMap(); + this.images = new HashMap(); this.title = ""; this.headline = ""; this.text = new serverByteBuffer(); @@ -117,12 +113,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } public void scrapeTag0(String tagname, Properties tagopts) { - if (tagname.equals("img")) image.setProperty(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt","")); + if (tagname.equals("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt","")); } public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { //System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text)); - if (tagname.equals("a")) anchor.setProperty(absolutePath(tagopts.getProperty("href", "")), + if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString()); if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes()); @@ -153,179 +149,19 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen return 
text.getBytes(); } - public Properties getAnchor() { - return anchor; - } - - public Properties getImage() { - return image; - } - - public Map getHyperlinks() { - if (hyperlinks == null) resortLinks(); - return hyperlinks; - } - - public Map getMedialinks() { - if (medialinks == null) resortLinks(); - return medialinks; - } - - public Map getEmaillinks() { - if (emaillinks == null) resortLinks(); - return emaillinks; - } - - HashMap hyperlinks = null; - HashMap medialinks = null; - HashMap emaillinks = null; - - private synchronized void resortLinks() { - Iterator i; - String url; - int extpos; - String ext; - i = anchor.entrySet().iterator(); - hyperlinks = new HashMap(); - medialinks = new HashMap(); - emaillinks = new HashMap(); - Map.Entry entry; - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - url = (String) entry.getKey(); - if ((url != null) && (url.startsWith("mailto:"))) { - emaillinks.put(url.substring(7), entry.getValue()); - } else { - extpos = url.lastIndexOf("."); - String normal; - if (extpos > 0) { - ext = url.substring(extpos).toLowerCase(); - normal = urlNormalform(url); - if (normal != null) { - if (mediaExt.indexOf(ext.substring(1)) >= 0) { - // this is not an normal anchor, its a media link - medialinks.put(normal, entry.getValue()); - } else { - hyperlinks.put(normal, entry.getValue()); - } - } - } - } - } - // finally add the images to the medialinks - i = image.entrySet().iterator(); - String normal; - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - url = (String) entry.getKey(); - normal = urlNormalform(url); - if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException - } - expandHyperlinks(); - } - - /* - private synchronized void resortLinks() { - Enumeration e; - String url; - int extpos; - String ext; - e = anchor.propertyNames(); - hyperlinks = new Properties(); - medialinks = new Properties(); - emaillinks = new Properties(); - while (e.hasMoreElements()) { - url = (String) e.nextElement(); - if ((url != null) && (url.startsWith("mailto:"))) { - emaillinks.setProperty(url.substring(7), anchor.getProperty(url)); - } else { - extpos = url.lastIndexOf("."); - String normal; - if (extpos > 0) { - ext = url.substring(extpos).toLowerCase(); - normal = urlNormalform(url); - if (normal != null) { - if (mediaExt.indexOf(ext.substring(1)) >= 0) { - // this is not an normal anchor, its a media link - medialinks.setProperty(normal, anchor.getProperty(url)); - } else { - hyperlinks.setProperty(normal, anchor.getProperty(url)); - } - } - } - } - } - // finally add the images to the medialinks - e = image.propertyNames(); - String normal; - while (e.hasMoreElements()) { - url = (String) e.nextElement(); - normal = urlNormalform(url); - if (normal != null) medialinks.setProperty(normal, image.getProperty(url)); // avoid NullPointerException - } + public Map getAnchors() { + return anchors; } -*/ - public synchronized void expandHyperlinks() { - // we add artificial hyperlinks to the hyperlink set that can be calculated from - // given hyperlinks and imagelinks - hyperlinks.putAll(allReflinks(hyperlinks)); - hyperlinks.putAll(allReflinks(medialinks)); - hyperlinks.putAll(allSubpaths(hyperlinks)); - hyperlinks.putAll(allSubpaths(medialinks)); + public Map getImages() { + return images; } - private static Map allReflinks(Map links) { - // we find all links that are part of a reference inside a url - HashMap v = new HashMap(); - Iterator i = links.keySet().iterator(); - String s; - int pos; - loop: while (i.hasNext()) { - s = 
(String) i.next(); - if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) { - i.remove(); - s = s.substring(pos); - while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos); - if (!(v.containsKey(s))) v.put(s, "ref"); - continue loop; - } - if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) { - i.remove(); - s = "http:/" + s.substring(pos); - while ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos); - if (!(v.containsKey(s))) v.put(s, "ref"); - continue loop; - } - } - return v; - } - - private static Map allSubpaths(Map links) { - HashMap v = new HashMap(); - Iterator i = links.keySet().iterator(); - String s; - int pos; - while (i.hasNext()) { - s = (String) i.next(); - if (s.endsWith("/")) s = s.substring(0, s.length() - 1); - pos = s.lastIndexOf("/"); - while (pos > 8) { - s = s.substring(0, pos + 1); - if (!(v.containsKey(s))) v.put(s, "sub"); - s = s.substring(0, pos); - pos = s.lastIndexOf("/"); - } - } - return v; - } - - public void print() { System.out.println("TITLE :" + title); System.out.println("HEADLINE:" + headline); - System.out.println("ANCHORS :" + anchor.toString()); - System.out.println("IMAGES :" + image.toString()); + System.out.println("ANCHORS :" + anchors.toString()); + System.out.println("IMAGES :" + images.toString()); System.out.println("TEXT :" + new String(text.getBytes())); } diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java index 6d187d877..4e2baa5ee 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java @@ -65,6 +65,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer } public void init(String initarg) { + System.out.println("Transformer init: " + initarg); if (bluelist == null) { // here, the initarg is used to load a list of bluelisted words bluelist = new Vector(); @@ -78,9 +79,14 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer r.close(); } catch (Exception e) { } + if (bluelist.size() == 0) System.out.println("BLUELIST is empty"); } } + public boolean isIdentityTransformer() { + return bluelist.size() == 0; + } + private static byte[] genBlueLetters(int length) { serverByteBuffer bb = new serverByteBuffer(" ".getBytes()); length = length / 2; diff --git a/source/de/anomic/htmlFilter/htmlFilterTransformer.java b/source/de/anomic/htmlFilter/htmlFilterTransformer.java index 816cf8138..bc68336ed 100644 --- a/source/de/anomic/htmlFilter/htmlFilterTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterTransformer.java @@ -49,6 +49,11 @@ public interface htmlFilterTransformer { // more specific transformation rules public void init(String initarg); + // ask if this transformer will do any transformation whatsoever + // this may return true if the initialization resultet in a status + // that does not allow any transformation + public boolean isIdentityTransformer(); + // tests, if a given body-less tag (i.e.
shall be supervised) // only tags that are defined here will be cached and not streamed public boolean isTag0(String tag); diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 7cbd696ee..6efb0ec6d 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -125,15 +125,9 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand if (!(htRootPath.exists())) htRootPath.mkdir(); } - // load a transformer - try { - ClassLoader cp = new serverClassLoader(this.getClass().getClassLoader()); - Class transformerClass = cp.loadClass(switchboard.getConfig("pageTransformerClass", "")); - transformer = (htmlFilterTransformer) transformerClass.newInstance(); - transformer.init(switchboard.getConfig("pageTransformerArg", "")); // this is usually the blueList - } catch (Exception e) { - transformer = null; - } + // load a transformer + transformer = new htmlFilterContentTransformer(); + transformer.init(new File(switchboard.getRootPath(), switchboard.getConfig("plasmaBlueList", "")).toString()); String f; // load the yellow-list @@ -396,7 +390,7 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand if (cacheExists) { // we respond on the request by using the cache - hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, null, switchboard.defaultProxyProfile); + hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, switchboard.defaultProxyProfile); if (hpc.shallUseCache()) { // the cache is fresh @@ -426,7 +420,8 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative' // make a transformer - if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) && + if ((!(transformer.isIdentityTransformer())) && + ((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) && ((cachedResponseHeader == null) || (httpd.isTextMime(cachedResponseHeader.mime(), switchboard.mimeWhite)))) { hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0)); } else { @@ -472,24 +467,30 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand res = remote.GET(remotePath, requestHeader); long contentLength = res.responseHeader.contentLength(); - // make a scraper and transformer + // reserver cache entry + hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile); + + // make a scraper and transformer if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) && (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) { - scraper = new htmlFilterContentScraper(url); - hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0)); - if (((htmlFilterOutputStream) hfos).binarySuspect()) { - scraper = null; // forget it, may be rubbish - log.logDebug("Content of " + url + " is probably binary. 
deleted scraper."); + if (transformer.isIdentityTransformer()) { + hfos = hpc.getContentOutputStream(); + } else { + scraper = new htmlFilterContentScraper(url); + hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0)); + if (((htmlFilterOutputStream) hfos).binarySuspect()) { + scraper = null; // forget it, may be rubbish + log.logDebug("Content of " + url + " is probably binary. deleted scraper."); + } + hpc.scraper = scraper; } } else { log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped"); scraper = null; hfos = respond; + hpc.scraper = scraper; } - // reserver cache entry - hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, scraper, null, switchboard.defaultProxyProfile); - // handle incoming cookies handleIncomingCookies(res.responseHeader, host, ip); @@ -502,7 +503,13 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand if ((contentLength > 0) && // known (contentLength < 1048576)) // 1 MB { - byte[] cacheArray = res.writeContent(hfos); + byte[] cacheArray; + if (transformer.isIdentityTransformer()) { + res.writeContentX(hfos, respond); + cacheArray = hpc.getContentBytes(); + } else { + cacheArray = res.writeContent(hfos); + } if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); // before we came here we deleted a cache entry if (sizeBeforeDelete == cacheArray.length) { @@ -514,8 +521,16 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache } } else { + // the file is too big to cache it in the ram, write to file cacheFile.getParentFile().mkdirs(); - res.writeContent(hfos, cacheFile); + if (transformer.isIdentityTransformer()) { + res.writeContent(respond, cacheFile); + if (contentLength < 10485760) { // 10 mb + serverFileUtils.copy(cacheFile, hfos); + } // else hfos is empty and that means: no work afterwards with it + } else { + res.writeContent(hfos, cacheFile); + } if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); // before we came here we deleted a cache entry if (sizeBeforeDelete == cacheFile.length()) { @@ -579,24 +594,30 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand httpc.response res = remote.GET(remotePath, requestHeader); long contentLength = res.responseHeader.contentLength(); - // make a scraper and transformer + // reserve cache entry + hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile); + + // make a scraper and transformer if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) && (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) { - scraper = new htmlFilterContentScraper(url); - hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0)); - if (((htmlFilterOutputStream) hfos).binarySuspect()) { - scraper = null; // forget it, may be rubbish - log.logDebug("Content of " + url + " is probably binary. 
deleted scraper."); + if (transformer.isIdentityTransformer()) { + hfos = hpc.getContentOutputStream(); + } else { + scraper = new htmlFilterContentScraper(url); + hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0)); + if (((htmlFilterOutputStream) hfos).binarySuspect()) { + scraper = null; // forget it, may be rubbish + log.logDebug("Content of " + url + " is probably binary. deleted scraper."); + } + hpc.scraper = scraper; } - } else { + } else { log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped"); scraper = null; hfos = respond; + hpc.scraper = scraper; } - // reserve cache entry - hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, scraper, null, switchboard.defaultProxyProfile); - // handle incoming cookies handleIncomingCookies(res.responseHeader, host, ip); @@ -608,16 +629,29 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand if ((storeError = hpc.shallStoreCache()) == null) { // we write a new cache entry if ((contentLength > 0) && (contentLength < 1048576)) { - // write to buffer - byte[] cacheArray = res.writeContent(hfos); - if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); + // write to buffer + byte[] cacheArray; + if (transformer.isIdentityTransformer()) { + res.writeContentX(hfos, respond); + cacheArray = hpc.getContentBytes(); + } else { + cacheArray = res.writeContent(hfos); + } + if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); // enQueue new entry with response header and file as byte[] hpc.status = plasmaHTCache.CACHE_FILL; cacheManager.stackProcess(hpc, cacheArray); } else try { // write to file system directly cacheFile.getParentFile().mkdirs(); - res.writeContent(hfos, cacheFile); + if (transformer.isIdentityTransformer()) { + res.writeContent(respond, cacheFile); + if (contentLength < 10485760) { // 10 mb + serverFileUtils.copy(cacheFile, hfos); + } // else hfos is empty and that means: no work afterwards with it + } else { + res.writeContent(hfos, cacheFile); + } if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); // enQueue new entry with response header hpc.status = plasmaHTCache.CACHE_FILL; @@ -711,9 +745,6 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand } } - - - public void doHead(Properties conProp, httpHeader requestHeader, OutputStream respond) throws IOException { String method = conProp.getProperty("METHOD"); String host = conProp.getProperty("HOST"); @@ -834,8 +865,6 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand respond.flush(); } - - public void doConnect(Properties conProp, de.anomic.http.httpHeader requestHeader, InputStream clientIn, OutputStream clientOut) throws IOException { String host = conProp.getProperty("HOST"); int port = Integer.parseInt(conProp.getProperty("PORT")); diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index 52aa1f1ff..bc1821b22 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -88,7 +88,7 @@ public class plasmaCrawlLoader { // we kill that thread thread.interrupt(); // hopefully this wakes him up. 
slots.remove(i); - System.out.println("CRAWLER: IGNORING SLEEPING DOWNLOAD SLOT " + thread.url.toString()); + log.logDebug("IGNORING SLEEPING DOWNLOAD SLOT " + thread.url.toString()); } } else { // thread i is dead, remove it @@ -198,31 +198,26 @@ public class plasmaCrawlLoader { // the transfer is ok long contentLength = res.responseHeader.contentLength(); - // make a scraper and transformer - htmlFilterContentScraper scraper = new htmlFilterContentScraper(url); - OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); - // reserve cache entry - plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, scraper, initiator, profile); + plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile); // request has been placed and result has been returned. work off response File cacheFile = cacheManager.getCachePath(url); try { if (!(httpd.isTextMime(res.responseHeader.mime().toLowerCase(), acceptMimeTypes))) { // if the response has not the right file type then reject file - hfos.close(); remote.close(); - System.out.println("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString()); + log.logInfo("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString()); htCache.status = plasmaHTCache.CACHE_UNFILLED; } else if ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null)) { // we write the new cache entry to file system directly cacheFile.getParentFile().mkdirs(); - res.writeContent(hfos, cacheFile); // writes in content scraper and cache file + res.writeContent(htCache.getContentOutputStream(), cacheFile); // writes in content scraper and cache file htCache.status = plasmaHTCache.CACHE_FILL; } else { if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error); // anyway, the content still lives in the content scraper - res.writeContent(hfos, null); // writes only into content scraper + res.writeContent(htCache.getContentOutputStream(), null); // writes only into content scraper htCache.status = plasmaHTCache.CACHE_PASSING; } // enQueue new entry with response header @@ -240,18 +235,18 @@ public class plasmaCrawlLoader { // but we clean the cache also, since it may be only partial // and most possible corrupted if (cacheFile.exists()) cacheFile.delete(); - System.out.println("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString()); + log.logError("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString()); } } else { // if the response has not the right response type then reject file - System.out.println("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString()); + log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString()); // not processed any further } remote.close(); } catch (Exception e) { // this may happen if the targeted host does not exist or anything with the // remote server was wrong. 
- System.out.println("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString()); + log.logError("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString()); e.printStackTrace(); } } diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index f6c809696..f5d5b9ebe 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -427,11 +427,10 @@ public class plasmaHTCache { public Entry newEntry(Date initDate, int depth, URL url, httpHeader requestHeader, String responseStatus, httpHeader responseHeader, - htmlFilterContentScraper scraper, String initiator, plasmaCrawlProfile.entry profile) { //System.out.println("NEW ENTRY: " + url.toString()); // DEBUG - return new Entry(initDate, depth, url, requestHeader, responseStatus, responseHeader, scraper, initiator, profile); + return new Entry(initDate, depth, url, requestHeader, responseStatus, responseHeader, initiator, profile); } public class Entry { @@ -449,15 +448,17 @@ public class plasmaHTCache { public String urlString; public int status; // cache load/hit/stale etc status public Date lastModified; - public htmlFilterContentScraper scraper; public char doctype; public String language; public plasmaCrawlProfile.entry profile; private String initiator; + public ByteArrayOutputStream content; + public htmlFilterContentScraper scraper; + + public Entry(Date initDate, int depth, URL url, httpHeader requestHeader, String responseStatus, httpHeader responseHeader, - htmlFilterContentScraper scraper, String initiator, plasmaCrawlProfile.entry profile) { @@ -478,7 +479,7 @@ public class plasmaHTCache { this.requestHeader = requestHeader; this.responseStatus = responseStatus; this.responseHeader = responseHeader; - this.scraper = scraper; + this.content = new ByteArrayOutputStream(); this.profile = profile; this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator); @@ -503,8 +504,16 @@ public class plasmaHTCache { // to be defined later: this.cacheArray = null; this.status = CACHE_UNFILLED; + this.scraper = null; } + public OutputStream getContentOutputStream() { + return (OutputStream) content; + } + public byte[] getContentBytes() { + try { content.flush(); } catch (IOException e) {} + return content.toByteArray(); + } public String initiator() { return initiator; } @@ -614,8 +623,129 @@ public class plasmaHTCache { return null; } - - public String shallIndexCache() { + + public boolean shallUseCache() { + // decide upon header information if a specific file should be taken from the cache or not + + //System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString()); + + // -CGI access in request + // CGI access makes the page very individual, and therefore not usable in caches + if (isPOST(urlString)) return false; + if (isCGI(urlString)) return false; + + // -authorization cases in request + if (requestHeader.containsKey("AUTHORIZATION")) return false; + + // -ranges in request + // we do not cache partial content + if ((requestHeader != null) && (requestHeader.containsKey("RANGE"))) return false; + + //Date d1, d2; + + // -if-modified-since in request + // The entity has to be transferred only if it has + // been modified since the date given by the If-Modified-Since header. 
+ if (requestHeader.containsKey("IF-MODIFIED-SINCE")) { + // checking this makes only sense if the cached response contains + // a Last-Modified field. If the field does not exist, we go the safe way + if (!(responseHeader.containsKey("Last-Modified"))) return false; + // parse date + Date d1, d2; + d2 = responseHeader.lastModified(); if (d2 == null) d2 = new Date(); + d1 = requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date(); + // finally, we shall treat the cache as stale if the modification time is after the if-.. time + if (d2.after(d1)) return false; + } + + boolean isNotPicture = !isPicture(responseHeader); + + // -cookies in request + // unfortunately, we should reload in case of a cookie + // but we think that pictures can still be considered as fresh + if ((requestHeader.containsKey("COOKIE")) && (isNotPicture)) return false; + + // -set-cookie in cached response + // this is a similar case as for COOKIE. + if ((responseHeader.containsKey("SET-COOKIE")) && (isNotPicture)) return false; // too strong + if ((responseHeader.containsKey("SET-COOKIE2")) && (isNotPicture)) return false; // too strong + + // -pragma in cached response + // logically, we would not need to care about no-cache pragmas in cached response headers, + // because they cannot exist since they are not written to the cache. + // So this IF should always fail.. + if ((responseHeader.containsKey("PRAGMA")) && + (((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return false; + + // calculate often needed values for freshness attributes + Date date = responseHeader.date(); + Date expires = responseHeader.expires(); + Date lastModified = responseHeader.lastModified(); + String cacheControl = (String) responseHeader.get("Cache-Control"); + + + // see for documentation also: + // http://www.web-caching.com/cacheability.html + // http://vancouver-webpages.com/CacheNow/ + + // look for freshnes information + // if we don't have any freshnes indication, we treat the file as stale. + // no handle for freshness control: + if ((expires == null) && (cacheControl == null) && (lastModified == null)) return false; + + // -expires in cached response + // the expires value gives us a very easy hint when the cache is stale + if (expires != null) { + Date yesterday = new Date((new Date()).getTime() - oneday); + if (expires.before(yesterday)) return false; + } + + // -lastModified in cached response + // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read + // of the file and the last modified date as the age of the file. If we consider the file as + // middel-aged then, the maximum TTL would be cache-creation plus age. + // This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache + // file may only be treated as fresh for one more month, not more. + if (lastModified != null) { + if (date == null) date = new Date(); + long age = date.getTime() - lastModified.getTime(); + if (age < 0) return false; + // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10 + // the actual living-time is new Date().getTime() - d2.getTime() + // therefore the cache is stale, if Date().getTime() - d2.getTime() > age/10 + if ((new Date()).getTime() - date.getTime() > age / 10) return false; + } + + // -cache-control in cached response + // the cache-control has many value options. 
+ if (cacheControl != null) { + cacheControl = cacheControl.trim().toUpperCase(); + if (cacheControl.startsWith("PUBLIC")) { + // ok, do nothing + } else if ((cacheControl.startsWith("PRIVATE")) || + (cacheControl.startsWith("NO-CACHE")) || + (cacheControl.startsWith("NO-STORE"))) { + // easy case + return false; + } else if (cacheControl.startsWith("MAX-AGE=")) { + // we need also the load date + if (date == null) return false; + try { + long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live + if ((new Date()).getTime() - date.getTime() > ttl) { + return false; + } + } catch (Exception e) { + return false; + } + } + } + + return true; + } + + + public String shallIndexCacheForProxy() { // decide upon header information if a specific file should be indexed // this method returns null if the answer is 'YES'! // if the answer is 'NO' (do not index), it returns a string with the reason @@ -670,10 +800,8 @@ public class plasmaHTCache { // thus we do not care about it here for indexing // -pragma in cached response - /* if ((responseHeader.containsKey("PRAGMA")) && (((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return "Denied_(pragma_no_cache)"; - */ // see for documentation also: // http://www.web-caching.com/cacheability.html @@ -732,126 +860,69 @@ public class plasmaHTCache { return null; } - - public boolean shallUseCache() { - // decide upon header information if a specific file should be taken from the cache or not - - //System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString()); + + + public String shallIndexCacheForCrawler() { + // decide upon header information if a specific file should be indexed + // this method returns null if the answer is 'YES'! + // if the answer is 'NO' (do not index), it returns a string with the reason + // to reject the crawling demand in clear text + // check profile + if (!(profile.localIndexing())) return "Indexing_Not_Allowed"; + // -CGI access in request // CGI access makes the page very individual, and therefore not usable in caches - if (isPOST(urlString)) return false; - if (isCGI(urlString)) return false; + if ((isPOST(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)"; + if ((isCGI(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)"; // -authorization cases in request - if (requestHeader.containsKey("AUTHORIZATION")) return false; + // we checked that in shallStoreCache // -ranges in request - // we do not cache partial content - if ((requestHeader != null) && (requestHeader.containsKey("RANGE"))) return false; + // we checked that in shallStoreCache - //Date d1, d2; + // a picture cannot be indexed + if (isPicture(responseHeader)) return "Media_Content_(Picture)"; + if (!(isText(responseHeader))) return "Media_Content_(not_text)"; + if (noIndexingURL(urlString)) return "Media_Content_(forbidden)"; // -if-modified-since in request - // The entity has to be transferred only if it has - // been modified since the date given by the If-Modified-Since header. - if (requestHeader.containsKey("IF-MODIFIED-SINCE")) { - // checking this makes only sense if the cached response contains - // a Last-Modified field. 
If the field does not exist, we go the safe way - if (!(responseHeader.containsKey("Last-Modified"))) return false; - // parse date - Date d1, d2; - d2 = responseHeader.lastModified(); if (d2 == null) d2 = new Date(); - d1 = requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date(); - // finally, we shall treat the cache as stale if the modification time is after the if-.. time - if (d2.after(d1)) return false; - } + // if the page is fresh at the very moment we can index it + // -> this does not apply for the crawler - boolean isNotPicture = !isPicture(responseHeader); - // -cookies in request - // unfortunately, we should reload in case of a cookie - // but we think that pictures can still be considered as fresh - if ((requestHeader.containsKey("COOKIE")) && (isNotPicture)) return false; - - // -set-cookie in cached response - // this is a similar case as for COOKIE. - if ((responseHeader.containsKey("SET-COOKIE")) && (isNotPicture)) return false; // too strong - if ((responseHeader.containsKey("SET-COOKIE2")) && (isNotPicture)) return false; // too strong - - // -pragma in cached response - // logically, we would not need to care about no-cache pragmas in cached response headers, - // because they cannot exist since they are not written to the cache. - // So this IF should always fail.. - if ((responseHeader.containsKey("PRAGMA")) && - (((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return false; - - // calculate often needed values for freshness attributes - Date date = responseHeader.date(); - Date expires = responseHeader.expires(); - Date lastModified = responseHeader.lastModified(); - String cacheControl = (String) responseHeader.get("Cache-Control"); - - - // see for documentation also: - // http://www.web-caching.com/cacheability.html - // http://vancouver-webpages.com/CacheNow/ + // unfortunately, we cannot index pages which have been requested with a cookie + // because the returned content may be special for the client + // -> this does not apply for a crawler + + // -set-cookie in response + // the set-cookie from the server does not indicate that the content is special + // thus we do not care about it here for indexing + // -> this does not apply for a crawler + // -pragma in cached response + // -> in the crawler we ignore this + // look for freshnes information - // if we don't have any freshnes indication, we treat the file as stale. - // no handle for freshness control: - if ((expires == null) && (cacheControl == null) && (lastModified == null)) return false; // -expires in cached response // the expires value gives us a very easy hint when the cache is stale - if (expires != null) { - Date yesterday = new Date((new Date()).getTime() - oneday); - if (expires.before(yesterday)) return false; - } + // sometimes, the expires date is set to the past to prevent that a page is cached + // we use that information to see if we should index it + // -> this does not apply for a crawler // -lastModified in cached response - // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read - // of the file and the last modified date as the age of the file. If we consider the file as - // middel-aged then, the maximum TTL would be cache-creation plus age. - // This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache - // file may only be treated as fresh for one more month, not more. 
- if (lastModified != null) { - if (date == null) date = new Date(); - long age = date.getTime() - lastModified.getTime(); - if (age < 0) return false; - // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10 - // the actual living-time is new Date().getTime() - d2.getTime() - // therefore the cache is stale, if Date().getTime() - d2.getTime() > age/10 - if ((new Date()).getTime() - date.getTime() > age / 10) return false; - } + // this information is too weak to use it to prevent indexing + // even if we can apply a TTL heuristic for cache usage // -cache-control in cached response // the cache-control has many value options. - if (cacheControl != null) { - cacheControl = cacheControl.trim().toUpperCase(); - if (cacheControl.startsWith("PUBLIC")) { - // ok, do nothing - } else if ((cacheControl.startsWith("PRIVATE")) || - (cacheControl.startsWith("NO-CACHE")) || - (cacheControl.startsWith("NO-STORE"))) { - // easy case - return false; - } else if (cacheControl.startsWith("MAX-AGE=")) { - // we need also the load date - if (date == null) return false; - try { - long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live - if ((new Date()).getTime() - date.getTime() > ttl) { - return false; - } - } catch (Exception e) { - return false; - } - } - } + // -> in the crawler we ignore this - return true; + return null; } + } } diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 747f9c7eb..aafb9e83f 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -59,17 +59,25 @@ public class plasmaParser { } - public document parse(URL location, String mimeType, byte[] source) { + public document parseSource(URL location, String mimeType, byte[] source) { // make a scraper and transformer htmlFilterContentScraper scraper = new htmlFilterContentScraper(location); OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); try { hfos.write(source); + return transformScraper(location, mimeType, scraper); + } catch (IOException e) { + return null; + } + } + + public document transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) { + try { return new document(new URL(urlNormalform(location)), - mimeType, null, null, scraper.getHeadline(), - null, null, - scraper.getText(), scraper.getAnchor(), scraper.getImage()); - } catch (Exception e) { + mimeType, null, null, scraper.getHeadline(), + null, null, + scraper.getText(), scraper.getAnchors(), scraper.getImages()); + } catch (MalformedURLException e) { return null; } } @@ -89,8 +97,6 @@ public class plasmaParser { return us; } - - public class document { URL location; // the source url diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 633f4610e..68aa6c746 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -147,6 +147,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi public HashSet extensionBlack; public HashMap outgoingCookies, incomingCookies; public kelondroTables facilityDB; + public plasmaParser parser; public int serverJobs; public boolean terminate = false; @@ -203,28 +204,10 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi // make crawl profiles database and default profiles profiles = new plasmaCrawlProfile(new File(plasmaPath, "crawlProfiles0.db")); - - 
//System.out.println("profiles.size=" + profiles.size()); - //System.out.println("profile-config=" + getConfig("defaultProxyProfile", "").length()); - //System.out.println("profile-entry=" + profiles.getEntry(getConfig("defaultProxyProfile", "")).toString()); - if ((profiles.size() == 0) || - (getConfig("defaultProxyProfile", "").length() == 0) || - (profiles.getEntry(getConfig("defaultProxyProfile", "")) == null)) { - // generate new default entry for proxy crawling - defaultProxyProfile = profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), false, true, true, true, false, true, true, true); - setConfig("defaultProxyProfile", defaultProxyProfile.handle()); - } else { - defaultProxyProfile = profiles.getEntry(getConfig("defaultProxyProfile", "")); - } - if ((profiles.size() == 1) || - (getConfig("defaultRemoteProfile", "").length() == 0) || - (profiles.getEntry(getConfig("defaultRemoteProfile", "")) == null)) { - // generate new default entry for proxy crawling - defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, false, false, true, true, false, true, true, false); - setConfig("defaultRemoteProfile", defaultRemoteProfile.handle()); - } else { - defaultRemoteProfile = profiles.getEntry(getConfig("defaultRemoteProfile", "")); - } + initProfiles(); + + // make parser + parser = new plasmaParser(new File("")); // start indexing management loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL); @@ -309,14 +292,46 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi this.serverJobs = jobs; } + private void initProfiles() throws IOException { + if ((profiles.size() == 0) || + (getConfig("defaultProxyProfile", "").length() == 0) || + (profiles.getEntry(getConfig("defaultProxyProfile", "")) == null)) { + // generate new default entry for proxy crawling + defaultProxyProfile = profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), false, true, true, true, false, true, true, true); + setConfig("defaultProxyProfile", defaultProxyProfile.handle()); + } else { + defaultProxyProfile = profiles.getEntry(getConfig("defaultProxyProfile", "")); + } + if ((profiles.size() == 1) || + (getConfig("defaultRemoteProfile", "").length() == 0) || + (profiles.getEntry(getConfig("defaultRemoteProfile", "")) == null)) { + // generate new default entry for proxy crawling + defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, false, false, true, true, false, true, true, false); + setConfig("defaultRemoteProfile", defaultRemoteProfile.handle()); + } else { + defaultRemoteProfile = profiles.getEntry(getConfig("defaultRemoteProfile", "")); + } + } + private void resetProfiles() { + File pdb = new File(plasmaPath, "crawlProfiles0.db"); + if (pdb.exists()) pdb.delete(); + try { + profiles = new plasmaCrawlProfile(pdb); + initProfiles(); + } catch (IOException e) {} + } private void cleanProfiles() { if (totalSize() > 0) return; Iterator i = profiles.profiles(true); plasmaCrawlProfile.entry entry; - while (i.hasNext()) { - entry = (plasmaCrawlProfile.entry) i.next(); - if (!((entry.name().equals("proxy")) || (entry.name().equals("remote")))) i.remove(); - } + try { + while (i.hasNext()) { + entry = (plasmaCrawlProfile.entry) i.next(); + if (!((entry.name().equals("proxy")) || (entry.name().equals("remote")))) i.remove(); + } + } catch 
(kelondroException e) { + resetProfiles(); + } } public plasmaHTCache getCacheManager() { @@ -454,7 +469,8 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi private synchronized void processResourceStack(plasmaHTCache.Entry entry) { // work off one stack entry with a fresh resource (scraped web page) - if (entry.scraper != null) try { + byte[] content; + if (((content = entry.getContentBytes()).length > 0) || (entry.scraper != null)) try { // we must distinguish the following cases: resource-load was initiated by // 1) global crawling: the index is extern, not here (not possible here) // 2) result of search queries, some indexes are here (not possible here) @@ -479,10 +495,20 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi log.logDebug("processResourceStack: processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", url=" + entry.url); // DEBUG + // parse content + plasmaParser.document document; + if (entry.scraper != null) { + log.logDebug("(Parser) '" + entry.urlString + "' is pre-parsed by scraper"); + document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper); + } else { + log.logDebug("(Parser) '" + entry.urlString + "' is not parsed, parsing now"); + document = parser.parseSource(entry.url, entry.responseHeader.mime(), content); + } + // put anchors on crawl stack if (((processCase == 4) || (processCase == 5)) && (entry.depth < entry.profile.generalDepth())) { - Map hl = entry.scraper.getHyperlinks(); + Map hl = document.getHyperlinks(); Iterator i = hl.entrySet().iterator(); String nexturlstring; String rejectReason; @@ -500,18 +526,26 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi } } log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() + - ", NEW CRAWL STACK SIZE IS " + noticeURL.localStackSize()); + ", NEW CRAWL STACK SIZE IS " + noticeURL.localStackSize()); } // create index - String noIndexReason; - String descr = entry.scraper.getHeadline(); + + String descr = document.getMainLongTitle(); URL referrerURL = entry.referrerURL(); String referrerHash = (referrerURL == null) ? 
plasmaURL.dummyHash : plasmaURL.urlHash(referrerURL); - if ((noIndexReason = entry.shallIndexCache()) == null ) { + String noIndexReason = "unspecified"; + if (processCase == 4) { + // proxy-load + noIndexReason = entry.shallIndexCacheForProxy(); + } else { + // normal crawling + noIndexReason = entry.shallIndexCacheForCrawler(); + } + if (noIndexReason == null) { // strip out words log.logDebug("(Profile) Condensing for '" + entry.urlString + "'"); - plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(entry.scraper.getText())); + plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText())); //log.logInfo("INDEXING HEADLINE:" + descr); try { @@ -573,7 +607,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi } // explicit delete/free resources - entry.scraper = null; entry = null; + document = null; entry = null; } catch (IOException e) { log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString()); } @@ -1310,6 +1344,10 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi log.logError("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage()); e.printStackTrace(); return new plasmaWordIndexEntity[0]; + } catch (kelondroException e) { + log.logError("selectTransferIndexes database corrupted: " + e.getMessage()); + e.printStackTrace(); + return new plasmaWordIndexEntity[0]; } } diff --git a/source/de/anomic/plasma/plasmaWordIndexFileCache.java b/source/de/anomic/plasma/plasmaWordIndexFileCache.java index 517686efb..c714ce6f7 100644 --- a/source/de/anomic/plasma/plasmaWordIndexFileCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexFileCache.java @@ -62,6 +62,7 @@ package de.anomic.plasma; import java.io.*; import java.util.*; +import de.anomic.server.*; import de.anomic.kelondro.*; public class plasmaWordIndexFileCache { @@ -72,24 +73,43 @@ public class plasmaWordIndexFileCache { // class variables private File databaseRoot; private kelondroTree indexCache; + private int bufferkb; public plasmaWordIndexFileCache(File databaseRoot, int bufferkb) throws IOException { this.databaseRoot = databaseRoot; + this.bufferkb = bufferkb; File indexCacheFile = new File(databaseRoot, indexCacheFileName); if (indexCacheFile.exists()) { // simply open the file indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400); } else { - // create a new file - int[] columns = new int[buffers + 2]; - columns[0] = plasmaWordIndexEntry.wordHashLength; - columns[1] = 1; - for (int i = 0; i < buffers; i++) columns[i + 2] = plasmaCrawlLURL.urlHashLength + plasmaWordIndexEntry.attrSpaceShort; - indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400, columns); + createCacheFile(indexCacheFile); } } - + private void resetCacheFile() { + // this has to be used in emergencies only + // it can happen that there is a serious db inconsistency; in that case we re-create the indexCache + try { indexCache.close(); } catch (IOException e) {} + File indexCacheFile = new File(databaseRoot, indexCacheFileName); + if (indexCacheFile.exists()) indexCacheFile.delete(); + try { + createCacheFile(indexCacheFile); + } catch (IOException e) { + de.anomic.server.serverLog.logError("PLASMA", "plasmaWordIndexFileCache.resetCacheFile: serious failure creating the cache file: " + e.getMessage()); + indexCache = null; + } + } + + private void createCacheFile(File indexCacheFile) throws IOException { + // create a new file + int[] columns = new int[buffers + 2]; + columns[0] = 
plasmaWordIndexEntry.wordHashLength; + columns[1] = 1; + for (int i = 0; i < buffers; i++) columns[i + 2] = plasmaCrawlLURL.urlHashLength + plasmaWordIndexEntry.attrSpaceShort; + indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400, columns); + } + protected void close() throws IOException { indexCache.close(); indexCache = null; @@ -162,8 +182,12 @@ public class plasmaWordIndexFileCache { indexCache.put(row); } catch (kelondroException e) { // this is a very bad case; a database inconsistency occurred - deleteComplete(wordHash); - System.out.println("fatal error in plasmaWordIndexFileCacle.addEntriesToIndex: write to word hash file " + wordHash + " failed - " + e.getMessage() + " - index deleted."); + serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted"); + resetCacheFile(); + } catch (IOException e) { + // this is a very bad case; a database inconsistency occurred + serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted"); + resetCacheFile(); } } // finished! diff --git a/source/yacy.java b/source/yacy.java index 51e2d4c15..eca12829c 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -208,7 +208,7 @@ public class yacy { } // init parser - de.anomic.htmlFilter.htmlFilterContentScraper.mediaExt = sb.getConfig("mediaExt",""); + de.anomic.plasma.plasmaParser.mediaExt = sb.getConfig("mediaExt",""); // start main threads try { diff --git a/yacy.blue b/yacy.blue new file mode 100644 index 000000000..d2833eb81 --- /dev/null +++ b/yacy.blue @@ -0,0 +1 @@ +testblue diff --git a/yacy.init b/yacy.init index f0f62fe49..97b01a63b 100644 --- a/yacy.init +++ b/yacy.init @@ -166,16 +166,8 @@ remoteProxyUse=false #remoteProxyUse=true # the proxy may filter the content of transferred web pages -# this is archieved using a special filtering class that can be -# exchanged like a transformation plug-in -# If you want to do this, you must implement the htmlFilterTransformer -# -Interface and set the name of the implementing class here. -# As a default, we use a filtering Transformer that takes a blacklist -# and blocks all text fragments where a word from the blacklist appears -# as the blacklist, we use the search-engine's blue-list -# please see that class as an implementation example for your own transformers -pageTransformerClass=htmlFilterContentTransformer -pageTransformerArg=yacy.blue +# the bluelist removes specific keywords from web pages +proxyBlueList=yacy.blue # security settigns # we provide proxy and server security through a 2-stage security gate:
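
Illustration (not part of the patch): the core of this change is the new isIdentityTransformer() gate in httpdProxyHandler — the proxy only routes responses through the bluelist filter when the configured bluelist file (yacy.blue) actually contains words; an empty bluelist makes the transformer an identity transformer, and the response body is then streamed straight into the cache entry. The following minimal, self-contained sketch mirrors that decision; apart from isIdentityTransformer(), the class and method names are simplified stand-ins, not the actual YaCy classes.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;

public class BluelistGateSketch {

    // Simplified stand-in for htmlFilterContentTransformer: it loads the bluelist
    // and reports whether it would transform anything at all.
    static class BluelistTransformer {
        private final List<String> bluelist = new ArrayList<>();

        // mirrors init(initarg): one bluelisted word per line, e.g. read from yacy.blue
        void init(File bluelistFile) {
            try (BufferedReader r = new BufferedReader(new FileReader(bluelistFile))) {
                String line;
                while ((line = r.readLine()) != null) {
                    line = line.trim();
                    if (!line.isEmpty()) bluelist.add(line.toLowerCase());
                }
            } catch (Exception e) {
                // a missing or unreadable bluelist simply leaves the list empty
            }
        }

        // mirrors the new isIdentityTransformer(): an empty bluelist means the
        // transformer would never change any content, so filtering can be bypassed
        boolean isIdentityTransformer() {
            return bluelist.isEmpty();
        }
    }

    public static void main(String[] args) {
        BluelistTransformer transformer = new BluelistTransformer();
        transformer.init(new File(args.length > 0 ? args[0] : "yacy.blue"));

        if (transformer.isIdentityTransformer()) {
            // patch behaviour: skip the htmlFilterOutputStream and write the response
            // body straight into the cache entry's content output stream
            System.out.println("bluelist empty -> proxy streams the response unmodified");
        } else {
            // non-empty bluelist: wrap the response in the filtering output stream
            System.out.println("bluelist present -> proxy applies bluelist word filtering");
        }
    }
}

This is also what the commit message means by the "disable-feature": bluelist filtering in the proxy is now skipped by default and is only activated again by providing a non-empty bluelist.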