Very experimental integration of the new generic parser and optional disabling of bluelist filtering in the proxy. Does not yet work properly. To disable the disable-feature (that is, to keep bluelist filtering active), a non-empty bluelist must be present.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@17 6c8d7289-2bf4-0310-a012-ef5d649a1542
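The gist of the bluelist switch described above, as a minimal sketch (only isIdentityTransformer() and the proxy-side check are taken from this commit; the surrounding variables such as mime are illustrative and the real parameter lists are longer):

    // htmlFilterContentTransformer: report whether the transformer would change
    // anything at all; in this commit an empty bluelist means "no".
    public boolean isIdentityTransformer() {
        return bluelist.size() == 0;
    }

    // httpdProxyHandler (simplified): wrap the response in the filtering output
    // stream only when a transformation would actually happen.
    OutputStream hfos;
    if (!transformer.isIdentityTransformer() && httpd.isTextMime(mime, switchboard.mimeWhite)) {
        hfos = new htmlFilterOutputStream(respond, scraper, transformer, false);
    } else {
        hfos = respond; // identity case: pass the page through untouched
    }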
orbiter 20 years ago
parent 96516fc9d8
commit e7d055b98e

@@ -114,11 +114,12 @@ public class CacheAdmin_p {
      else {
          htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
          OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
+         plasmaParser.document document = switchboard.parser.transformScraper(url, "text/html", scraper);
          serverFileUtils.copy(file, os);
          info += "<b>HEADLINE:</b><br>" + scraper.getHeadline() + "<br><br>";
-         info += "<b>HREF:</b><br>" + formatAnchor(scraper.getHyperlinks()) + "<br>";
-         info += "<b>MEDIA:</b><br>" + formatAnchor(scraper.getMedialinks()) + "<br>";
-         info += "<b>EMAIL:</b><br>" + formatAnchor(scraper.getEmaillinks()) + "<br>";
+         info += "<b>HREF:</b><br>" + formatAnchor(document.getHyperlinks()) + "<br>";
+         info += "<b>MEDIA:</b><br>" + formatAnchor(document.getMedialinks()) + "<br>";
+         info += "<b>EMAIL:</b><br>" + formatAnchor(document.getEmaillinks()) + "<br>";
          info += "<b>TEXT:</b><br><span class=\"small\">" + new String(scraper.getText()) + "</span><br>";
      }
  } catch (Exception e) {

@@ -305,7 +305,7 @@ public class IndexCreate_p {
      prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
      prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth);
      prop.put("indexing-queue_list_"+i+"_modified", daydate(pcentry.lastModified));
-     prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getHyperlinks().size())));
+     prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getAnchors().size())));
      prop.put("indexing-queue_list_"+i+"_anchor", ((pcentry.scraper == null) ? "-" : pcentry.scraper.getHeadline()) );
      prop.put("indexing-queue_list_"+i+"_url", pcentry.urlString);
      dark = !dark;

@@ -52,10 +52,6 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
  private static HashSet linkTags0;
  private static HashSet linkTags1;
- public static String mediaExt =
-     "swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
-     "sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj";
  static {
      linkTags0 = new HashSet();
      linkTags0.add("img");
@@ -67,8 +63,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
  }
  // class variables: collectors for links
- private Properties anchor;
- private Properties image;
+ private HashMap anchors;
+ private HashMap images;
  private String title;
  private String headline;
  private serverByteBuffer text;
@@ -79,8 +75,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
      // it is only the reference for relative links
      super(linkTags0, linkTags1);
      this.root = root;
-     this.anchor = new Properties();
-     this.image = new Properties();
+     this.anchors = new HashMap();
+     this.images = new HashMap();
      this.title = "";
      this.headline = "";
      this.text = new serverByteBuffer();
@@ -117,12 +113,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
  }
  public void scrapeTag0(String tagname, Properties tagopts) {
-     if (tagname.equals("img")) image.setProperty(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
+     if (tagname.equals("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
  }
  public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
      //System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
-     if (tagname.equals("a")) anchor.setProperty(absolutePath(tagopts.getProperty("href", "")),
+     if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")),
          new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString());
      if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
      if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
@ -153,179 +149,19 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return text.getBytes(); return text.getBytes();
} }
public Properties getAnchor() { public Map getAnchors() {
return anchor; return anchors;
}
public Properties getImage() {
return image;
}
public Map getHyperlinks() {
if (hyperlinks == null) resortLinks();
return hyperlinks;
}
public Map getMedialinks() {
if (medialinks == null) resortLinks();
return medialinks;
}
public Map getEmaillinks() {
if (emaillinks == null) resortLinks();
return emaillinks;
}
HashMap hyperlinks = null;
HashMap medialinks = null;
HashMap emaillinks = null;
private synchronized void resortLinks() {
Iterator i;
String url;
int extpos;
String ext;
i = anchor.entrySet().iterator();
hyperlinks = new HashMap();
medialinks = new HashMap();
emaillinks = new HashMap();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
if ((url != null) && (url.startsWith("mailto:"))) {
emaillinks.put(url.substring(7), entry.getValue());
} else {
extpos = url.lastIndexOf(".");
String normal;
if (extpos > 0) {
ext = url.substring(extpos).toLowerCase();
normal = urlNormalform(url);
if (normal != null) {
if (mediaExt.indexOf(ext.substring(1)) >= 0) {
// this is not an normal anchor, its a media link
medialinks.put(normal, entry.getValue());
} else {
hyperlinks.put(normal, entry.getValue());
}
}
}
}
}
// finally add the images to the medialinks
i = image.entrySet().iterator();
String normal;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
normal = urlNormalform(url);
if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
}
expandHyperlinks();
}
/*
private synchronized void resortLinks() {
Enumeration e;
String url;
int extpos;
String ext;
e = anchor.propertyNames();
hyperlinks = new Properties();
medialinks = new Properties();
emaillinks = new Properties();
while (e.hasMoreElements()) {
url = (String) e.nextElement();
if ((url != null) && (url.startsWith("mailto:"))) {
emaillinks.setProperty(url.substring(7), anchor.getProperty(url));
} else {
extpos = url.lastIndexOf(".");
String normal;
if (extpos > 0) {
ext = url.substring(extpos).toLowerCase();
normal = urlNormalform(url);
if (normal != null) {
if (mediaExt.indexOf(ext.substring(1)) >= 0) {
// this is not an normal anchor, its a media link
medialinks.setProperty(normal, anchor.getProperty(url));
} else {
hyperlinks.setProperty(normal, anchor.getProperty(url));
}
}
}
}
}
// finally add the images to the medialinks
e = image.propertyNames();
String normal;
while (e.hasMoreElements()) {
url = (String) e.nextElement();
normal = urlNormalform(url);
if (normal != null) medialinks.setProperty(normal, image.getProperty(url)); // avoid NullPointerException
}
} }
*/
public synchronized void expandHyperlinks() { public Map getImages() {
// we add artificial hyperlinks to the hyperlink set that can be calculated from return images;
// given hyperlinks and imagelinks
hyperlinks.putAll(allReflinks(hyperlinks));
hyperlinks.putAll(allReflinks(medialinks));
hyperlinks.putAll(allSubpaths(hyperlinks));
hyperlinks.putAll(allSubpaths(medialinks));
} }
private static Map allReflinks(Map links) {
// we find all links that are part of a reference inside a url
HashMap v = new HashMap();
Iterator i = links.keySet().iterator();
String s;
int pos;
loop: while (i.hasNext()) {
s = (String) i.next();
if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) {
i.remove();
s = s.substring(pos);
while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos);
if (!(v.containsKey(s))) v.put(s, "ref");
continue loop;
}
if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) {
i.remove();
s = "http:/" + s.substring(pos);
while ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos);
if (!(v.containsKey(s))) v.put(s, "ref");
continue loop;
}
}
return v;
}
private static Map allSubpaths(Map links) {
HashMap v = new HashMap();
Iterator i = links.keySet().iterator();
String s;
int pos;
while (i.hasNext()) {
s = (String) i.next();
if (s.endsWith("/")) s = s.substring(0, s.length() - 1);
pos = s.lastIndexOf("/");
while (pos > 8) {
s = s.substring(0, pos + 1);
if (!(v.containsKey(s))) v.put(s, "sub");
s = s.substring(0, pos);
pos = s.lastIndexOf("/");
}
}
return v;
}
public void print() { public void print() {
System.out.println("TITLE :" + title); System.out.println("TITLE :" + title);
System.out.println("HEADLINE:" + headline); System.out.println("HEADLINE:" + headline);
System.out.println("ANCHORS :" + anchor.toString()); System.out.println("ANCHORS :" + anchors.toString());
System.out.println("IMAGES :" + image.toString()); System.out.println("IMAGES :" + images.toString());
System.out.println("TEXT :" + new String(text.getBytes())); System.out.println("TEXT :" + new String(text.getBytes()));
} }

@@ -65,6 +65,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
  }
  public void init(String initarg) {
+     System.out.println("Transformer init: " + initarg);
      if (bluelist == null) {
          // here, the initarg is used to load a list of bluelisted words
          bluelist = new Vector();
@@ -78,9 +79,14 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
              r.close();
          } catch (Exception e) {
          }
+         if (bluelist.size() == 0) System.out.println("BLUELIST is empty");
      }
  }
+ public boolean isIdentityTransformer() {
+     return bluelist.size() == 0;
+ }
  private static byte[] genBlueLetters(int length) {
      serverByteBuffer bb = new serverByteBuffer(" <FONT COLOR=#0000FF>".getBytes());
      length = length / 2;

@@ -49,6 +49,11 @@ public interface htmlFilterTransformer {
  // more specific transformation rules
  public void init(String initarg);
+ // ask if this transformer will do any transformation whatsoever
+ // this may return true if the initialization resultet in a status
+ // that does not allow any transformation
+ public boolean isIdentityTransformer();
  // tests, if a given body-less tag (i.e. <br> shall be supervised)
  // only tags that are defined here will be cached and not streamed
  public boolean isTag0(String tag);

@@ -125,15 +125,9 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
      if (!(htRootPath.exists())) htRootPath.mkdir();
  }
  // load a transformer
- try {
-     ClassLoader cp = new serverClassLoader(this.getClass().getClassLoader());
-     Class transformerClass = cp.loadClass(switchboard.getConfig("pageTransformerClass", ""));
-     transformer = (htmlFilterTransformer) transformerClass.newInstance();
-     transformer.init(switchboard.getConfig("pageTransformerArg", "")); // this is usually the blueList
- } catch (Exception e) {
-     transformer = null;
- }
+ transformer = new htmlFilterContentTransformer();
+ transformer.init(new File(switchboard.getRootPath(), switchboard.getConfig("plasmaBlueList", "")).toString());
  String f;
  // load the yellow-list
@@ -396,7 +390,7 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
  if (cacheExists) {
      // we respond on the request by using the cache
-     hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, null, switchboard.defaultProxyProfile);
+     hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, switchboard.defaultProxyProfile);
      if (hpc.shallUseCache()) {
          // the cache is fresh
@@ -426,7 +420,8 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
      respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative'
      // make a transformer
-     if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
+     if ((!(transformer.isIdentityTransformer())) &&
+         ((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
          ((cachedResponseHeader == null) || (httpd.isTextMime(cachedResponseHeader.mime(), switchboard.mimeWhite)))) {
          hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0));
      } else {
@@ -472,24 +467,30 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
      res = remote.GET(remotePath, requestHeader);
      long contentLength = res.responseHeader.contentLength();
-     // make a scraper and transformer
+     // reserver cache entry
+     hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
+     // make a scraper and transformer
      if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
          (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
-         scraper = new htmlFilterContentScraper(url);
-         hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
-         if (((htmlFilterOutputStream) hfos).binarySuspect()) {
-             scraper = null; // forget it, may be rubbish
-             log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
+         if (transformer.isIdentityTransformer()) {
+             hfos = hpc.getContentOutputStream();
+         } else {
+             scraper = new htmlFilterContentScraper(url);
+             hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
+             if (((htmlFilterOutputStream) hfos).binarySuspect()) {
+                 scraper = null; // forget it, may be rubbish
+                 log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
+             }
+             hpc.scraper = scraper;
          }
      } else {
          log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
          scraper = null;
          hfos = respond;
+         hpc.scraper = scraper;
      }
-     // reserver cache entry
-     hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, scraper, null, switchboard.defaultProxyProfile);
      // handle incoming cookies
      handleIncomingCookies(res.responseHeader, host, ip);
@@ -502,7 +503,13 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
      if ((contentLength > 0) && // known
          (contentLength < 1048576)) // 1 MB
      {
-         byte[] cacheArray = res.writeContent(hfos);
+         byte[] cacheArray;
+         if (transformer.isIdentityTransformer()) {
+             res.writeContentX(hfos, respond);
+             cacheArray = hpc.getContentBytes();
+         } else {
+             cacheArray = res.writeContent(hfos);
+         }
          if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
          // before we came here we deleted a cache entry
          if (sizeBeforeDelete == cacheArray.length) {
@@ -514,8 +521,16 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
          cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache
      }
      } else {
+         // the file is too big to cache it in the ram, write to file
          cacheFile.getParentFile().mkdirs();
-         res.writeContent(hfos, cacheFile);
+         if (transformer.isIdentityTransformer()) {
+             res.writeContent(respond, cacheFile);
+             if (contentLength < 10485760) { // 10 mb
+                 serverFileUtils.copy(cacheFile, hfos);
+             } // else hfos is empty and that means: no work afterwards with it
+         } else {
+             res.writeContent(hfos, cacheFile);
+         }
          if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
          // before we came here we deleted a cache entry
          if (sizeBeforeDelete == cacheFile.length()) {
@ -579,24 +594,30 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
httpc.response res = remote.GET(remotePath, requestHeader); httpc.response res = remote.GET(remotePath, requestHeader);
long contentLength = res.responseHeader.contentLength(); long contentLength = res.responseHeader.contentLength();
// make a scraper and transformer // reserve cache entry
hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
// make a scraper and transformer
if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) && if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
(httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) { (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
scraper = new htmlFilterContentScraper(url); if (transformer.isIdentityTransformer()) {
hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0)); hfos = hpc.getContentOutputStream();
if (((htmlFilterOutputStream) hfos).binarySuspect()) { } else {
scraper = null; // forget it, may be rubbish scraper = new htmlFilterContentScraper(url);
log.logDebug("Content of " + url + " is probably binary. deleted scraper."); hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
if (((htmlFilterOutputStream) hfos).binarySuspect()) {
scraper = null; // forget it, may be rubbish
log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
}
hpc.scraper = scraper;
} }
} else { } else {
log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped"); log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
scraper = null; scraper = null;
hfos = respond; hfos = respond;
hpc.scraper = scraper;
} }
// reserve cache entry
hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, scraper, null, switchboard.defaultProxyProfile);
// handle incoming cookies // handle incoming cookies
handleIncomingCookies(res.responseHeader, host, ip); handleIncomingCookies(res.responseHeader, host, ip);
@ -608,16 +629,29 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
if ((storeError = hpc.shallStoreCache()) == null) { if ((storeError = hpc.shallStoreCache()) == null) {
// we write a new cache entry // we write a new cache entry
if ((contentLength > 0) && (contentLength < 1048576)) { if ((contentLength > 0) && (contentLength < 1048576)) {
// write to buffer // write to buffer
byte[] cacheArray = res.writeContent(hfos); byte[] cacheArray;
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); if (transformer.isIdentityTransformer()) {
res.writeContentX(hfos, respond);
cacheArray = hpc.getContentBytes();
} else {
cacheArray = res.writeContent(hfos);
}
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// enQueue new entry with response header and file as byte[] // enQueue new entry with response header and file as byte[]
hpc.status = plasmaHTCache.CACHE_FILL; hpc.status = plasmaHTCache.CACHE_FILL;
cacheManager.stackProcess(hpc, cacheArray); cacheManager.stackProcess(hpc, cacheArray);
} else try { } else try {
// write to file system directly // write to file system directly
cacheFile.getParentFile().mkdirs(); cacheFile.getParentFile().mkdirs();
res.writeContent(hfos, cacheFile); if (transformer.isIdentityTransformer()) {
res.writeContent(respond, cacheFile);
if (contentLength < 10485760) { // 10 mb
serverFileUtils.copy(cacheFile, hfos);
} // else hfos is empty and that means: no work afterwards with it
} else {
res.writeContent(hfos, cacheFile);
}
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// enQueue new entry with response header // enQueue new entry with response header
hpc.status = plasmaHTCache.CACHE_FILL; hpc.status = plasmaHTCache.CACHE_FILL;
@ -711,9 +745,6 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
} }
} }
public void doHead(Properties conProp, httpHeader requestHeader, OutputStream respond) throws IOException { public void doHead(Properties conProp, httpHeader requestHeader, OutputStream respond) throws IOException {
String method = conProp.getProperty("METHOD"); String method = conProp.getProperty("METHOD");
String host = conProp.getProperty("HOST"); String host = conProp.getProperty("HOST");
@ -834,8 +865,6 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
respond.flush(); respond.flush();
} }
public void doConnect(Properties conProp, de.anomic.http.httpHeader requestHeader, InputStream clientIn, OutputStream clientOut) throws IOException { public void doConnect(Properties conProp, de.anomic.http.httpHeader requestHeader, InputStream clientIn, OutputStream clientOut) throws IOException {
String host = conProp.getProperty("HOST"); String host = conProp.getProperty("HOST");
int port = Integer.parseInt(conProp.getProperty("PORT")); int port = Integer.parseInt(conProp.getProperty("PORT"));

@@ -88,7 +88,7 @@ public class plasmaCrawlLoader {
      // we kill that thread
      thread.interrupt(); // hopefully this wakes him up.
      slots.remove(i);
-     System.out.println("CRAWLER: IGNORING SLEEPING DOWNLOAD SLOT " + thread.url.toString());
+     log.logDebug("IGNORING SLEEPING DOWNLOAD SLOT " + thread.url.toString());
  }
  } else {
      // thread i is dead, remove it
@ -198,31 +198,26 @@ public class plasmaCrawlLoader {
// the transfer is ok // the transfer is ok
long contentLength = res.responseHeader.contentLength(); long contentLength = res.responseHeader.contentLength();
// make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
// reserve cache entry // reserve cache entry
plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, scraper, initiator, profile); plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile);
// request has been placed and result has been returned. work off response // request has been placed and result has been returned. work off response
File cacheFile = cacheManager.getCachePath(url); File cacheFile = cacheManager.getCachePath(url);
try { try {
if (!(httpd.isTextMime(res.responseHeader.mime().toLowerCase(), acceptMimeTypes))) { if (!(httpd.isTextMime(res.responseHeader.mime().toLowerCase(), acceptMimeTypes))) {
// if the response has not the right file type then reject file // if the response has not the right file type then reject file
hfos.close();
remote.close(); remote.close();
System.out.println("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString()); log.logInfo("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString());
htCache.status = plasmaHTCache.CACHE_UNFILLED; htCache.status = plasmaHTCache.CACHE_UNFILLED;
} else if ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null)) { } else if ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null)) {
// we write the new cache entry to file system directly // we write the new cache entry to file system directly
cacheFile.getParentFile().mkdirs(); cacheFile.getParentFile().mkdirs();
res.writeContent(hfos, cacheFile); // writes in content scraper and cache file res.writeContent(htCache.getContentOutputStream(), cacheFile); // writes in content scraper and cache file
htCache.status = plasmaHTCache.CACHE_FILL; htCache.status = plasmaHTCache.CACHE_FILL;
} else { } else {
if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error); if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error);
// anyway, the content still lives in the content scraper // anyway, the content still lives in the content scraper
res.writeContent(hfos, null); // writes only into content scraper res.writeContent(htCache.getContentOutputStream(), null); // writes only into content scraper
htCache.status = plasmaHTCache.CACHE_PASSING; htCache.status = plasmaHTCache.CACHE_PASSING;
} }
// enQueue new entry with response header // enQueue new entry with response header
@ -240,18 +235,18 @@ public class plasmaCrawlLoader {
// but we clean the cache also, since it may be only partial // but we clean the cache also, since it may be only partial
// and most possible corrupted // and most possible corrupted
if (cacheFile.exists()) cacheFile.delete(); if (cacheFile.exists()) cacheFile.delete();
System.out.println("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString()); log.logError("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString());
} }
} else { } else {
// if the response has not the right response type then reject file // if the response has not the right response type then reject file
System.out.println("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString()); log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString());
// not processed any further // not processed any further
} }
remote.close(); remote.close();
} catch (Exception e) { } catch (Exception e) {
// this may happen if the targeted host does not exist or anything with the // this may happen if the targeted host does not exist or anything with the
// remote server was wrong. // remote server was wrong.
System.out.println("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString()); log.logError("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString());
e.printStackTrace(); e.printStackTrace();
} }
} }

@@ -427,11 +427,10 @@ public class plasmaHTCache {
  public Entry newEntry(Date initDate, int depth, URL url,
                        httpHeader requestHeader,
                        String responseStatus, httpHeader responseHeader,
-                       htmlFilterContentScraper scraper,
                        String initiator,
                        plasmaCrawlProfile.entry profile) {
      //System.out.println("NEW ENTRY: " + url.toString()); // DEBUG
-     return new Entry(initDate, depth, url, requestHeader, responseStatus, responseHeader, scraper, initiator, profile);
+     return new Entry(initDate, depth, url, requestHeader, responseStatus, responseHeader, initiator, profile);
  }
  public class Entry {
@@ -449,15 +448,17 @@ public class plasmaHTCache {
      public String urlString;
      public int status; // cache load/hit/stale etc status
      public Date lastModified;
-     public htmlFilterContentScraper scraper;
      public char doctype;
      public String language;
      public plasmaCrawlProfile.entry profile;
      private String initiator;
+     public ByteArrayOutputStream content;
+     public htmlFilterContentScraper scraper;
      public Entry(Date initDate, int depth, URL url,
                   httpHeader requestHeader,
                   String responseStatus, httpHeader responseHeader,
-                  htmlFilterContentScraper scraper,
                   String initiator,
                   plasmaCrawlProfile.entry profile) {
@@ -478,7 +479,7 @@ public class plasmaHTCache {
      this.requestHeader = requestHeader;
      this.responseStatus = responseStatus;
      this.responseHeader = responseHeader;
-     this.scraper = scraper;
+     this.content = new ByteArrayOutputStream();
      this.profile = profile;
      this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator);
@@ -503,8 +504,16 @@ public class plasmaHTCache {
      // to be defined later:
      this.cacheArray = null;
      this.status = CACHE_UNFILLED;
+     this.scraper = null;
  }
+ public OutputStream getContentOutputStream() {
+     return (OutputStream) content;
+ }
+ public byte[] getContentBytes() {
+     try { content.flush(); } catch (IOException e) {}
+     return content.toByteArray();
+ }
  public String initiator() {
      return initiator;
  }
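A hedged sketch of how the new in-memory cache entry buffer is meant to be used (the method and field names are taken from this diff; the surrounding control flow is simplified and illustrative):

    // Proxy side, identity-transformer case: stream the body into the entry.
    OutputStream os = hpc.getContentOutputStream();  // backed by the ByteArrayOutputStream above
    res.writeContentX(os, respond);                  // copy to the cache buffer and to the client

    // Switchboard side, later: if no scraper was attached, parse the buffered bytes.
    byte[] content = hpc.getContentBytes();
    if (content.length > 0) {
        plasmaParser.document document = parser.parseSource(hpc.url, hpc.responseHeader.mime(), content);
    }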
@ -614,8 +623,129 @@ public class plasmaHTCache {
return null; return null;
} }
public String shallIndexCache() { public boolean shallUseCache() {
// decide upon header information if a specific file should be taken from the cache or not
//System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString());
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if (isPOST(urlString)) return false;
if (isCGI(urlString)) return false;
// -authorization cases in request
if (requestHeader.containsKey("AUTHORIZATION")) return false;
// -ranges in request
// we do not cache partial content
if ((requestHeader != null) && (requestHeader.containsKey("RANGE"))) return false;
//Date d1, d2;
// -if-modified-since in request
// The entity has to be transferred only if it has
// been modified since the date given by the If-Modified-Since header.
if (requestHeader.containsKey("IF-MODIFIED-SINCE")) {
// checking this makes only sense if the cached response contains
// a Last-Modified field. If the field does not exist, we go the safe way
if (!(responseHeader.containsKey("Last-Modified"))) return false;
// parse date
Date d1, d2;
d2 = responseHeader.lastModified(); if (d2 == null) d2 = new Date();
d1 = requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date();
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d2.after(d1)) return false;
}
boolean isNotPicture = !isPicture(responseHeader);
// -cookies in request
// unfortunately, we should reload in case of a cookie
// but we think that pictures can still be considered as fresh
if ((requestHeader.containsKey("COOKIE")) && (isNotPicture)) return false;
// -set-cookie in cached response
// this is a similar case as for COOKIE.
if ((responseHeader.containsKey("SET-COOKIE")) && (isNotPicture)) return false; // too strong
if ((responseHeader.containsKey("SET-COOKIE2")) && (isNotPicture)) return false; // too strong
// -pragma in cached response
// logically, we would not need to care about no-cache pragmas in cached response headers,
// because they cannot exist since they are not written to the cache.
// So this IF should always fail..
if ((responseHeader.containsKey("PRAGMA")) &&
(((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return false;
// calculate often needed values for freshness attributes
Date date = responseHeader.date();
Date expires = responseHeader.expires();
Date lastModified = responseHeader.lastModified();
String cacheControl = (String) responseHeader.get("Cache-Control");
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// http://vancouver-webpages.com/CacheNow/
// look for freshnes information
// if we don't have any freshnes indication, we treat the file as stale.
// no handle for freshness control:
if ((expires == null) && (cacheControl == null) && (lastModified == null)) return false;
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
if (expires != null) {
Date yesterday = new Date((new Date()).getTime() - oneday);
if (expires.before(yesterday)) return false;
}
// -lastModified in cached response
// we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
// of the file and the last modified date as the age of the file. If we consider the file as
// middel-aged then, the maximum TTL would be cache-creation plus age.
// This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
// file may only be treated as fresh for one more month, not more.
if (lastModified != null) {
if (date == null) date = new Date();
long age = date.getTime() - lastModified.getTime();
if (age < 0) return false;
// TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
// the actual living-time is new Date().getTime() - d2.getTime()
// therefore the cache is stale, if Date().getTime() - d2.getTime() > age/10
if ((new Date()).getTime() - date.getTime() > age / 10) return false;
}
// -cache-control in cached response
// the cache-control has many value options.
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("PUBLIC")) {
// ok, do nothing
} else if ((cacheControl.startsWith("PRIVATE")) ||
(cacheControl.startsWith("NO-CACHE")) ||
(cacheControl.startsWith("NO-STORE"))) {
// easy case
return false;
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
if (date == null) return false;
try {
long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if ((new Date()).getTime() - date.getTime() > ttl) {
return false;
}
} catch (Exception e) {
return false;
}
}
}
return true;
}
public String shallIndexCacheForProxy() {
// decide upon header information if a specific file should be indexed // decide upon header information if a specific file should be indexed
// this method returns null if the answer is 'YES'! // this method returns null if the answer is 'YES'!
// if the answer is 'NO' (do not index), it returns a string with the reason // if the answer is 'NO' (do not index), it returns a string with the reason
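To make the freshness (TTL) heuristic in shallUseCache() above concrete, a small worked example (the dates are illustrative only):

    // Cached copy loaded on June 1st (date); the server reported Last-Modified April 2nd
    // (lastModified), so the document was about 60 days old when it was fetched.
    long age = date.getTime() - lastModified.getTime();                  // ~60 days
    boolean stale = (new Date()).getTime() - date.getTime() > age / 10;  // stale ~6 days after the load
    // i.e. the cached copy is treated as fresh only until about June 7th.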
@@ -670,10 +800,8 @@ public class plasmaHTCache {
      // thus we do not care about it here for indexing
      // -pragma in cached response
-     /*
      if ((responseHeader.containsKey("PRAGMA")) &&
          (((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return "Denied_(pragma_no_cache)";
-     */
      // see for documentation also:
      // http://www.web-caching.com/cacheability.html
@ -732,126 +860,69 @@ public class plasmaHTCache {
return null; return null;
} }
public boolean shallUseCache() {
// decide upon header information if a specific file should be taken from the cache or not public String shallIndexCacheForCrawler() {
// decide upon header information if a specific file should be indexed
//System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString()); // this method returns null if the answer is 'YES'!
// if the answer is 'NO' (do not index), it returns a string with the reason
// to reject the crawling demand in clear text
// check profile
if (!(profile.localIndexing())) return "Indexing_Not_Allowed";
// -CGI access in request // -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches // CGI access makes the page very individual, and therefore not usable in caches
if (isPOST(urlString)) return false; if ((isPOST(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)";
if (isCGI(urlString)) return false; if ((isCGI(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)";
// -authorization cases in request // -authorization cases in request
if (requestHeader.containsKey("AUTHORIZATION")) return false; // we checked that in shallStoreCache
// -ranges in request // -ranges in request
// we do not cache partial content // we checked that in shallStoreCache
if ((requestHeader != null) && (requestHeader.containsKey("RANGE"))) return false;
//Date d1, d2; // a picture cannot be indexed
if (isPicture(responseHeader)) return "Media_Content_(Picture)";
if (!(isText(responseHeader))) return "Media_Content_(not_text)";
if (noIndexingURL(urlString)) return "Media_Content_(forbidden)";
// -if-modified-since in request // -if-modified-since in request
// The entity has to be transferred only if it has // if the page is fresh at the very moment we can index it
// been modified since the date given by the If-Modified-Since header. // -> this does not apply for the crawler
if (requestHeader.containsKey("IF-MODIFIED-SINCE")) {
// checking this makes only sense if the cached response contains
// a Last-Modified field. If the field does not exist, we go the safe way
if (!(responseHeader.containsKey("Last-Modified"))) return false;
// parse date
Date d1, d2;
d2 = responseHeader.lastModified(); if (d2 == null) d2 = new Date();
d1 = requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date();
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d2.after(d1)) return false;
}
boolean isNotPicture = !isPicture(responseHeader);
// -cookies in request // -cookies in request
// unfortunately, we should reload in case of a cookie // unfortunately, we cannot index pages which have been requested with a cookie
// but we think that pictures can still be considered as fresh // because the returned content may be special for the client
if ((requestHeader.containsKey("COOKIE")) && (isNotPicture)) return false; // -> this does not apply for a crawler
// -set-cookie in cached response // -set-cookie in response
// this is a similar case as for COOKIE. // the set-cookie from the server does not indicate that the content is special
if ((responseHeader.containsKey("SET-COOKIE")) && (isNotPicture)) return false; // too strong // thus we do not care about it here for indexing
if ((responseHeader.containsKey("SET-COOKIE2")) && (isNotPicture)) return false; // too strong // -> this does not apply for a crawler
// -pragma in cached response
// logically, we would not need to care about no-cache pragmas in cached response headers,
// because they cannot exist since they are not written to the cache.
// So this IF should always fail..
if ((responseHeader.containsKey("PRAGMA")) &&
(((String) responseHeader.get("Pragma")).toUpperCase().equals("NO-CACHE"))) return false;
// calculate often needed values for freshness attributes
Date date = responseHeader.date();
Date expires = responseHeader.expires();
Date lastModified = responseHeader.lastModified();
String cacheControl = (String) responseHeader.get("Cache-Control");
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// http://vancouver-webpages.com/CacheNow/
// -pragma in cached response
// -> in the crawler we ignore this
// look for freshnes information // look for freshnes information
// if we don't have any freshnes indication, we treat the file as stale.
// no handle for freshness control:
if ((expires == null) && (cacheControl == null) && (lastModified == null)) return false;
// -expires in cached response // -expires in cached response
// the expires value gives us a very easy hint when the cache is stale // the expires value gives us a very easy hint when the cache is stale
if (expires != null) { // sometimes, the expires date is set to the past to prevent that a page is cached
Date yesterday = new Date((new Date()).getTime() - oneday); // we use that information to see if we should index it
if (expires.before(yesterday)) return false; // -> this does not apply for a crawler
}
// -lastModified in cached response // -lastModified in cached response
// we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read // this information is too weak to use it to prevent indexing
// of the file and the last modified date as the age of the file. If we consider the file as // even if we can apply a TTL heuristic for cache usage
// middel-aged then, the maximum TTL would be cache-creation plus age.
// This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
// file may only be treated as fresh for one more month, not more.
if (lastModified != null) {
if (date == null) date = new Date();
long age = date.getTime() - lastModified.getTime();
if (age < 0) return false;
// TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
// the actual living-time is new Date().getTime() - d2.getTime()
// therefore the cache is stale, if Date().getTime() - d2.getTime() > age/10
if ((new Date()).getTime() - date.getTime() > age / 10) return false;
}
// -cache-control in cached response // -cache-control in cached response
// the cache-control has many value options. // the cache-control has many value options.
if (cacheControl != null) { // -> in the crawler we ignore this
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("PUBLIC")) {
// ok, do nothing
} else if ((cacheControl.startsWith("PRIVATE")) ||
(cacheControl.startsWith("NO-CACHE")) ||
(cacheControl.startsWith("NO-STORE"))) {
// easy case
return false;
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
if (date == null) return false;
try {
long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if ((new Date()).getTime() - date.getTime() > ttl) {
return false;
}
} catch (Exception e) {
return false;
}
}
}
return true; return null;
} }
} }
} }

@@ -59,17 +59,25 @@ public class plasmaParser {
  }
- public document parse(URL location, String mimeType, byte[] source) {
+ public document parseSource(URL location, String mimeType, byte[] source) {
      // make a scraper and transformer
      htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
      OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
      try {
          hfos.write(source);
+         return transformScraper(location, mimeType, scraper);
+     } catch (IOException e) {
+         return null;
+     }
+ }
+ public document transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
+     try {
          return new document(new URL(urlNormalform(location)),
                              mimeType, null, null, scraper.getHeadline(),
-                             scraper.getText(), scraper.getAnchor(), scraper.getImage());
-     } catch (Exception e) {
+                             null, null, null, null,
+                             scraper.getText(), scraper.getAnchors(), scraper.getImages());
+     } catch (MalformedURLException e) {
          return null;
      }
  }
@@ -89,8 +97,6 @@ public class plasmaParser {
      return us;
  }
  public class document {
      URL location; // the source url
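A hedged usage sketch of the two parser entry points introduced here (transformScraper and parseSource come from this diff; the "text/html" mime type and variable names are illustrative):

    plasmaParser parser = switchboard.parser;

    // Case 1: the page was already scraped while streaming through the proxy.
    plasmaParser.document doc = parser.transformScraper(url, "text/html", scraper);

    // Case 2: only raw bytes are available, so scrape and parse them now.
    plasmaParser.document doc2 = parser.parseSource(url, "text/html", contentBytes);

    Map links = doc.getHyperlinks();  // e.g. CacheAdmin_p and the crawl stack read these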

@ -147,6 +147,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
public HashSet extensionBlack; public HashSet extensionBlack;
public HashMap outgoingCookies, incomingCookies; public HashMap outgoingCookies, incomingCookies;
public kelondroTables facilityDB; public kelondroTables facilityDB;
public plasmaParser parser;
public int serverJobs; public int serverJobs;
public boolean terminate = false; public boolean terminate = false;
@ -203,28 +204,10 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
// make crawl profiles database and default profiles // make crawl profiles database and default profiles
profiles = new plasmaCrawlProfile(new File(plasmaPath, "crawlProfiles0.db")); profiles = new plasmaCrawlProfile(new File(plasmaPath, "crawlProfiles0.db"));
initProfiles();
//System.out.println("profiles.size=" + profiles.size());
//System.out.println("profile-config=" + getConfig("defaultProxyProfile", "").length()); // make parser
//System.out.println("profile-entry=" + profiles.getEntry(getConfig("defaultProxyProfile", "")).toString()); parser = new plasmaParser(new File(""));
if ((profiles.size() == 0) ||
(getConfig("defaultProxyProfile", "").length() == 0) ||
(profiles.getEntry(getConfig("defaultProxyProfile", "")) == null)) {
// generate new default entry for proxy crawling
defaultProxyProfile = profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), false, true, true, true, false, true, true, true);
setConfig("defaultProxyProfile", defaultProxyProfile.handle());
} else {
defaultProxyProfile = profiles.getEntry(getConfig("defaultProxyProfile", ""));
}
if ((profiles.size() == 1) ||
(getConfig("defaultRemoteProfile", "").length() == 0) ||
(profiles.getEntry(getConfig("defaultRemoteProfile", "")) == null)) {
// generate new default entry for proxy crawling
defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, false, false, true, true, false, true, true, false);
setConfig("defaultRemoteProfile", defaultRemoteProfile.handle());
} else {
defaultRemoteProfile = profiles.getEntry(getConfig("defaultRemoteProfile", ""));
}
// start indexing management // start indexing management
loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL); loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL);
@ -309,14 +292,46 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
this.serverJobs = jobs; this.serverJobs = jobs;
} }
private void initProfiles() throws IOException {
if ((profiles.size() == 0) ||
(getConfig("defaultProxyProfile", "").length() == 0) ||
(profiles.getEntry(getConfig("defaultProxyProfile", "")) == null)) {
// generate new default entry for proxy crawling
defaultProxyProfile = profiles.newEntry("proxy", "", ".*", ".*", Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), Integer.parseInt(getConfig("proxyPrefetchDepth", "0")), false, true, true, true, false, true, true, true);
setConfig("defaultProxyProfile", defaultProxyProfile.handle());
} else {
defaultProxyProfile = profiles.getEntry(getConfig("defaultProxyProfile", ""));
}
if ((profiles.size() == 1) ||
(getConfig("defaultRemoteProfile", "").length() == 0) ||
(profiles.getEntry(getConfig("defaultRemoteProfile", "")) == null)) {
// generate new default entry for proxy crawling
defaultRemoteProfile = profiles.newEntry("remote", "", ".*", ".*", 0, 0, false, false, true, true, false, true, true, false);
setConfig("defaultRemoteProfile", defaultRemoteProfile.handle());
} else {
defaultRemoteProfile = profiles.getEntry(getConfig("defaultRemoteProfile", ""));
}
}
private void resetProfiles() {
File pdb = new File(plasmaPath, "crawlProfiles0.db");
if (pdb.exists()) pdb.delete();
try {
profiles = new plasmaCrawlProfile(pdb);
initProfiles();
} catch (IOException e) {}
}
private void cleanProfiles() { private void cleanProfiles() {
if (totalSize() > 0) return; if (totalSize() > 0) return;
Iterator i = profiles.profiles(true); Iterator i = profiles.profiles(true);
plasmaCrawlProfile.entry entry; plasmaCrawlProfile.entry entry;
while (i.hasNext()) { try {
entry = (plasmaCrawlProfile.entry) i.next(); while (i.hasNext()) {
if (!((entry.name().equals("proxy")) || (entry.name().equals("remote")))) i.remove(); entry = (plasmaCrawlProfile.entry) i.next();
} if (!((entry.name().equals("proxy")) || (entry.name().equals("remote")))) i.remove();
}
} catch (kelondroException e) {
resetProfiles();
}
} }
public plasmaHTCache getCacheManager() { public plasmaHTCache getCacheManager() {
@ -454,7 +469,8 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
private synchronized void processResourceStack(plasmaHTCache.Entry entry) { private synchronized void processResourceStack(plasmaHTCache.Entry entry) {
// work off one stack entry with a fresh resource (scraped web page) // work off one stack entry with a fresh resource (scraped web page)
if (entry.scraper != null) try { byte[] content;
if (((content = entry.getContentBytes()).length > 0) || (entry.scraper != null)) try {
// we must distinguish the following cases: resource-load was initiated by // we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here) // 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here) // 2) result of search queries, some indexes are here (not possible here)
@ -479,10 +495,20 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
log.logDebug("processResourceStack: processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", url=" + entry.url); // DEBUG log.logDebug("processResourceStack: processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", url=" + entry.url); // DEBUG
// parse content
plasmaParser.document document;
if (entry.scraper != null) {
log.logDebug("(Parser) '" + entry.urlString + "' is pre-parsed by scraper");
document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
} else {
log.logDebug("(Parser) '" + entry.urlString + "' is not parsed, parsing now");
document = parser.parseSource(entry.url, entry.responseHeader.mime(), content);
}
// put anchors on crawl stack // put anchors on crawl stack
if (((processCase == 4) || (processCase == 5)) && if (((processCase == 4) || (processCase == 5)) &&
(entry.depth < entry.profile.generalDepth())) { (entry.depth < entry.profile.generalDepth())) {
Map hl = entry.scraper.getHyperlinks(); Map hl = document.getHyperlinks();
Iterator i = hl.entrySet().iterator(); Iterator i = hl.entrySet().iterator();
String nexturlstring; String nexturlstring;
String rejectReason; String rejectReason;
@ -500,18 +526,26 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
} }
} }
log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() + log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.url.toString() +
", NEW CRAWL STACK SIZE IS " + noticeURL.localStackSize()); ", NEW CRAWL STACK SIZE IS " + noticeURL.localStackSize());
} }
// create index // create index
String noIndexReason;
String descr = entry.scraper.getHeadline(); String descr = document.getMainLongTitle();
URL referrerURL = entry.referrerURL(); URL referrerURL = entry.referrerURL();
String referrerHash = (referrerURL == null) ? plasmaURL.dummyHash : plasmaURL.urlHash(referrerURL); String referrerHash = (referrerURL == null) ? plasmaURL.dummyHash : plasmaURL.urlHash(referrerURL);
if ((noIndexReason = entry.shallIndexCache()) == null ) { String noIndexReason = "unspecified";
if (processCase == 4) {
// proxy-load
noIndexReason = entry.shallIndexCacheForProxy();
} else {
// normal crawling
noIndexReason = entry.shallIndexCacheForCrawler();
}
if (noIndexReason == null) {
// strip out words // strip out words
log.logDebug("(Profile) Condensing for '" + entry.urlString + "'"); log.logDebug("(Profile) Condensing for '" + entry.urlString + "'");
plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(entry.scraper.getText())); plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText()));
//log.logInfo("INDEXING HEADLINE:" + descr); //log.logInfo("INDEXING HEADLINE:" + descr);
try { try {
@ -573,7 +607,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
} }
// explicit delete/free resources // explicit delete/free resources
entry.scraper = null; entry = null; document = null; entry = null;
} catch (IOException e) { } catch (IOException e) {
log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString()); log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString());
} }
@ -1310,6 +1344,10 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
log.logError("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage()); log.logError("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage());
e.printStackTrace(); e.printStackTrace();
return new plasmaWordIndexEntity[0]; return new plasmaWordIndexEntity[0];
} catch (kelondroException e) {
log.logError("selectTransferIndexes database corrupted: " + e.getMessage());
e.printStackTrace();
return new plasmaWordIndexEntity[0];
} }
} }

@ -62,6 +62,7 @@ package de.anomic.plasma;
import java.io.*; import java.io.*;
import java.util.*; import java.util.*;
import de.anomic.server.*;
import de.anomic.kelondro.*; import de.anomic.kelondro.*;
public class plasmaWordIndexFileCache { public class plasmaWordIndexFileCache {
@ -72,24 +73,43 @@ public class plasmaWordIndexFileCache {
// class variables // class variables
private File databaseRoot; private File databaseRoot;
private kelondroTree indexCache; private kelondroTree indexCache;
private int bufferkb;
public plasmaWordIndexFileCache(File databaseRoot, int bufferkb) throws IOException { public plasmaWordIndexFileCache(File databaseRoot, int bufferkb) throws IOException {
this.databaseRoot = databaseRoot; this.databaseRoot = databaseRoot;
this.bufferkb = bufferkb;
File indexCacheFile = new File(databaseRoot, indexCacheFileName); File indexCacheFile = new File(databaseRoot, indexCacheFileName);
if (indexCacheFile.exists()) { if (indexCacheFile.exists()) {
// simply open the file // simply open the file
indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400); indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400);
} else { } else {
// create a new file createCacheFile(indexCacheFile);
int[] columns = new int[buffers + 2];
columns[0] = plasmaWordIndexEntry.wordHashLength;
columns[1] = 1;
for (int i = 0; i < buffers; i++) columns[i + 2] = plasmaCrawlLURL.urlHashLength + plasmaWordIndexEntry.attrSpaceShort;
indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400, columns);
} }
} }
private void resetCacheFile() {
// this has to be used in emergencies only
// it can happen that there is a serious db inconsistency; in that case we re-create the indexCache
try { indexCache.close(); } catch (IOException e) {}
File indexCacheFile = new File(databaseRoot, indexCacheFileName);
if (indexCacheFile.exists()) indexCacheFile.delete();
try {
createCacheFile(indexCacheFile);
} catch (IOException e) {
de.anomic.server.serverLog.logError("PLASMA", "plasmaWordIndexFileCache.resetCacheFile: serious failure creating the cache file: " + e.getMessage());
indexCache = null;
}
}
private void createCacheFile(File indexCacheFile) throws IOException {
// create a new file
int[] columns = new int[buffers + 2];
columns[0] = plasmaWordIndexEntry.wordHashLength;
columns[1] = 1;
for (int i = 0; i < buffers; i++) columns[i + 2] = plasmaCrawlLURL.urlHashLength + plasmaWordIndexEntry.attrSpaceShort;
indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400, columns);
}
protected void close() throws IOException { protected void close() throws IOException {
indexCache.close(); indexCache.close();
indexCache = null; indexCache = null;
@@ -162,8 +182,12 @@ public class plasmaWordIndexFileCache {
          indexCache.put(row);
      } catch (kelondroException e) {
          // this is a very bad case; a database inconsistency occurred
-         deleteComplete(wordHash);
-         System.out.println("fatal error in plasmaWordIndexFileCacle.addEntriesToIndex: write to word hash file " + wordHash + " failed - " + e.getMessage() + " - index deleted.");
+         serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted");
+         resetCacheFile();
+     } catch (IOException e) {
+         // this is a very bad case; a database inconsistency occurred
+         serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted");
+         resetCacheFile();
      }
  }
  // finished!
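The recovery pattern added for the word index cache, sketched in isolation (simplified; the real code above logs through serverLog and rebuilds the kelondroTree with the same column layout as createCacheFile):

    try {
        indexCache.put(row);
    } catch (kelondroException e) {
        // serious database inconsistency: drop the damaged indexCache.db and recreate it
        serverLog.logError("PLASMA", "write of " + wordHash + " failed - " + e.getMessage());
        resetCacheFile();  // close, delete indexCache.db, then createCacheFile(...)
    }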

@@ -208,7 +208,7 @@ public class yacy {
  }
  // init parser
- de.anomic.htmlFilter.htmlFilterContentScraper.mediaExt = sb.getConfig("mediaExt","");
+ de.anomic.plasma.plasmaParser.mediaExt = sb.getConfig("mediaExt","");
  // start main threads
  try {

@ -0,0 +1 @@
testblue

@@ -166,16 +166,8 @@ remoteProxyUse=false
  #remoteProxyUse=true
  # the proxy may filter the content of transferred web pages
- # this is archieved using a special filtering class that can be
- # exchanged like a transformation plug-in
- # If you want to do this, you must implement the htmlFilterTransformer
- # -Interface and set the name of the implementing class here.
- # As a default, we use a filtering Transformer that takes a blacklist
- # and blocks all text fragments where a word from the blacklist appears
- # as the blacklist, we use the search-engine's blue-list
- # please see that class as an implementation example for your own transformers
- pageTransformerClass=htmlFilterContentTransformer
- pageTransformerArg=yacy.blue
+ # the bluelist removes specific keywords from web pages
+ proxyBlueList=yacy.blue
  # security settigns
  # we provide proxy and server security through a 2-stage security gate:
