fixed post-parsing (the case where the bluelist is empty)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@41 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 20 years ago
parent 044b93412a
commit 00f223cfc1

@@ -15,10 +15,11 @@ document.getElementById("value").value=element.value;
<body marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
#[header]#
<br><br>
<h2>advanced Config</h2>
<h2>Advanced Config</h2>
<p>
Here are all Config Options from YaCy.<br>
You can change anything, but some Options need a restart, and some Options can crash YaCy when wrong values are used.
For an explanation please look into yacy.init
</p>
<select name="options" size="25" style="width: 600">
#{options}#<option id="#[key]#" value="#[value]#" onclick="element_clicked(this)">#[key]#: #[value]#</option>

@@ -45,7 +45,7 @@
# Contributions and changes to the program code must be marked as such.
# define variables
version='0.363'
version='0.365'
datestr=`date +%Y%m%d`
#release='yacy_v'$version'_'$datestr
release='yacy_dev_v'$version'_'$datestr

@@ -468,7 +468,7 @@ public final class httpc {
// and change the Content-Encoding and Content-Length attributes in the header
byte[] buffer = new byte[2048];
int l;
long len = 0;
long len = 0;
// find out length
long length = responseHeader.contentLength();

@@ -348,16 +348,16 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// attach possible yacy-sublevel-domain
if ((yAddress != null) &&
((pos = yAddress.indexOf("/")) >= 0) &&
(!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level
) remotePath = yAddress.substring(pos) + remotePath;
((pos = yAddress.indexOf("/")) >= 0) &&
(!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level
) remotePath = yAddress.substring(pos) + remotePath;
// decide whether to use a cache entry or connect to the network
File cacheFile = cacheManager.getCachePath(url);
String urlHash = plasmaCrawlLURL.urlHash(url);
httpHeader cachedResponseHeader = null;
boolean cacheExists = ((cacheFile.exists()) && (cacheFile.isFile()) &&
((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null));
((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null));
// why are files unzipped upon arrival? why not zip all files in cache?
// This follows from the following premises
@@ -381,9 +381,9 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// 4. cache stale - refill - superfluous
// in two of these cases we trigger a scheduler to handle newly arrived files:
// case 1 and case 3
plasmaHTCache.Entry hpc;
plasmaHTCache.Entry cacheEntry;
if ((cacheExists) &&
((hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK",
((cacheEntry = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK",
cachedResponseHeader, null,
switchboard.defaultProxyProfile)).shallUseCache())) {
// we respond on the request by using the cache, the cache is fresh
@@ -466,29 +466,34 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
long contentLength = res.responseHeader.contentLength();
// reserve cache entry
hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
cacheEntry = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
// handle file types
if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
(httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
// this is a file that is a possible candidate for parsing by the indexer
if (transformer.isIdentityTransformer()) {
log.logDebug("create passthrough (parse candidate) for url " + url);
// no transformation, only passthrough
// this is especially the case if the bluelist is empty
// in that case, the content is not scraped here but later
hfos = respond;
} else {
// make a scraper and transformer
log.logDebug("create scraper for url " + url);
scraper = new htmlFilterContentScraper(url);
hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
if (((htmlFilterOutputStream) hfos).binarySuspect()) {
scraper = null; // forget it, may be rubbish
log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
}
hpc.scraper = scraper;
cacheEntry.scraper = scraper;
}
} else {
log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
scraper = null;
hfos = respond;
hpc.scraper = scraper;
cacheEntry.scraper = scraper;
}
// handle incoming cookies
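
Note: the heart of this change is the branch above. When the transformer is an identity transformer (which is the case when the bluelist is empty), the proxy no longer builds a scraper and simply passes the response through; the content is parsed later from the cache. A minimal, self-contained sketch of that decision, using hypothetical stand-in types rather than the real YaCy classes:

import java.io.FilterOutputStream;
import java.io.OutputStream;

// Hypothetical stand-ins; only the branching mirrors httpdProxyHandler above.
interface Transformer {
    boolean isIdentityTransformer(); // true when the bluelist is empty
}

class ScrapingOutputStream extends FilterOutputStream {
    ScrapingOutputStream(OutputStream out) { super(out); }
    // in YaCy this would also feed an htmlFilterContentScraper while streaming
}

class ProxyOutputSketch {
    static OutputStream wireOutput(OutputStream respond, Transformer transformer) {
        if (transformer.isIdentityTransformer()) {
            // no transformation, only passthrough; the content is scraped later,
            // from cacheEntry.cacheArray or from the cache file
            return respond;
        }
        // bluelist is non-empty: scrape and transform while streaming to the client
        return new ScrapingOutputStream(respond);
    }
}
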
@@ -498,47 +503,52 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
try {
respondHeader(respond, res.status, res.responseHeader);
String storeError;
if ((storeError = hpc.shallStoreCache()) == null) {
if ((storeError = cacheEntry.shallStoreCache()) == null) {
// we write a new cache entry
if ((contentLength > 0) && // known
(contentLength < 1048576)) {// 1 MB
// ok, we don't actually write into a file, only to RAM, and schedule writing the file.
byte[] cacheArray;
cacheArray = res.writeContent(hfos);
byte[] cacheArray = res.writeContent(hfos);
log.logDebug("writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length)));
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
if (sizeBeforeDelete == -1) {
// totally fresh file
hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(hpc, cacheArray);
cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(cacheEntry, cacheArray);
} else if (sizeBeforeDelete == cacheArray.length) {
// before we came here we deleted a cache entry
cacheArray = null;
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(hpc); // unnecessary update
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(cacheEntry); // unnecessary update
} else {
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(cacheEntry, cacheArray); // necessary update, write response header to cache
}
} else {
// the file is too big to cache it in the ram, write to file right here
// the file is too big to cache it in the ram, or the size is unknown
// write to file right here.
cacheFile.getParentFile().mkdirs();
res.writeContent(hfos, cacheFile);
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
log.logDebug("for write-file of " + url + ": contentLength = " + contentLength + ", sizeBeforeDelete = " + sizeBeforeDelete);
if (sizeBeforeDelete == -1) {
// totally fresh file
hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(hpc);
cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(cacheEntry);
} else if (sizeBeforeDelete == cacheFile.length()) {
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(hpc); // unnecessary update
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(cacheEntry); // unnecessary update
} else {
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(hpc); // necessary update, write response header to cache
cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(cacheEntry); // necessary update, write response header to cache
}
// beware! all these writings will not fill the cacheEntry.cacheArray
// that means they are not available for the indexer (unless they were scraped before)
}
} else {
// no caching
@@ -547,12 +557,12 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
if (sizeBeforeDelete == -1) {
// no old file and no load. just data passing
hpc.status = plasmaHTCache.CACHE_PASSING;
cacheManager.stackProcess(hpc);
cacheEntry.status = plasmaHTCache.CACHE_PASSING;
cacheManager.stackProcess(cacheEntry);
} else {
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
cacheManager.stackProcess(hpc);
cacheEntry.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
cacheManager.stackProcess(cacheEntry);
}
}
} catch (SocketException e) {

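Note: for storing, the handler above keeps responses of known size below 1 MB in RAM (and schedules the file write) and streams everything else directly into the cache file. A compact sketch of just that size check, with the threshold taken from the diff and the actual I/O left out:

// Sketch only: in YaCy the content is written via res.writeContent(...) and the
// resulting entry is handed to cacheManager.stackProcess(...).
class CacheWriteSketch {
    static final long RAM_CACHE_LIMIT = 1048576L; // 1 MB, as in the diff above

    static boolean cacheInRam(long contentLength) {
        // known length and small enough: buffer in RAM, write the file later;
        // unknown (<= 0) or too large: write to the cache file right away
        return contentLength > 0 && contentLength < RAM_CACHE_LIMIT;
    }
}
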
@@ -251,7 +251,9 @@ public final class plasmaHTCache {
entry.cacheFile.delete();
}
entry.cacheFile.getParentFile().mkdirs();
log.logInfo("WRITE FILE (" + entry.cacheArray.length + " bytes) " + entry.cacheFile);
serverFileUtils.write(entry.cacheArray, entry.cacheFile);
log.logInfo("AFTER WRITE cacheArray = " + entry.cacheFile + ": " + ((entry.cacheArray == null) ? "empty" : "full"));
//entry.cacheArray = null;
} catch (FileNotFoundException e) {
// this is the case of a "(Not a directory)" error, which should be prohibited
@@ -280,7 +282,10 @@ public final class plasmaHTCache {
case CACHE_UNFILLED:
log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break;
case CACHE_FILL:
log.logInfo("CACHE FILL: " + entry.cacheFile); break;
log.logInfo("CACHE FILL: " + entry.cacheFile +
((entry.cacheArray == null) ? "" : " (cacheArray is filled)") +
((entry.scraper == null) ? "" : " (scraper is filled)"));
break;
case CACHE_HIT:
log.logInfo("CACHE HIT: " + entry.cacheFile); break;
case CACHE_STALE_NO_RELOAD:

@@ -42,10 +42,11 @@
package de.anomic.plasma;
import de.anomic.htmlFilter.*;
import java.io.*;
import java.net.*;
import java.util.*;
import de.anomic.server.*;
import de.anomic.htmlFilter.*;
public class plasmaParser {
@@ -70,6 +71,18 @@ public class plasmaParser {
return null;
}
}
public document parseSource(URL location, String mimeType, File sourceFile) {
// make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
try {
serverFileUtils.copy(sourceFile, hfos);
return transformScraper(location, mimeType, scraper);
} catch (IOException e) {
return null;
}
}
public document transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
try {

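Note: the new parseSource(URL, String, File) overload above streams a cached file through a filtering output stream into a scraper and then calls transformScraper. The same pattern in plain Java, with a byte-collecting stream standing in for htmlFilterContentScraper (names and types here are illustrative, not YaCy API):

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

class ParseFromFileSketch {
    // stand-in for htmlFilterContentScraper: just collects the raw bytes
    static byte[] readSource(File sourceFile) throws IOException {
        ByteArrayOutputStream scraper = new ByteArrayOutputStream();
        InputStream in = new FileInputStream(sourceFile);
        try {
            byte[] buffer = new byte[2048];
            int l;
            while ((l = in.read(buffer)) != -1) scraper.write(buffer, 0, l); // like serverFileUtils.copy
        } finally {
            in.close();
        }
        return scraper.toByteArray(); // in YaCy the scraper is turned into a document here
    }
}
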
@@ -469,10 +469,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
private synchronized void processResourceStack(plasmaHTCache.Entry entry) {
// work off one stack entry with a fresh resource (scraped web page)
String stats = "DEQUEUE: dequeueing one step (processStack=" + processStack.size() + ", localStackSize=" + noticeURL.localStackSize() + ", remoteStackSize=" + noticeURL.remoteStackSize() + ")";
if ((entry.cacheArray == null) && (entry.scraper == null)) {
log.logDebug(stats + " entry for " + entry.nomalizedURLString + " has no content -- skipped");
return;
}
try {
// we must distinguish the following cases: resource-load was initiated by
@@ -504,10 +501,22 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
if (entry.scraper != null) {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is pre-parsed by scraper");
document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
} else {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed, parsing now");
} else if (entry.cacheArray != null) {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed yet, parsing now from cacheArray");
document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheArray);
}
} else {
if (entry.cacheFile.exists()) {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed yet, parsing now from File");
document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheFile);
} else {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' cannot be parsed, no resource available");
return;
}
}
if (document == null) {
log.logError("(Parser) '" + entry.nomalizedURLString + "' parse failure");
return;
}
// put anchors on crawl stack
if (((processCase == 4) || (processCase == 5)) &&

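Note: processResourceStack now tries three sources in order: a scraper that was filled while proxying, the in-RAM cacheArray, and finally the cache file on disk; if none is available the entry is skipped. A condensed, self-contained sketch of that dispatch with a hypothetical entry type:

import java.io.File;

// Hypothetical entry; the three fields mirror how plasmaHTCache.Entry is used above.
class ResourceEntrySketch {
    Object scraper;      // pre-parsed while proxying (bluelist non-empty)
    byte[] cacheArray;   // raw content buffered in RAM
    File cacheFile;      // content already written to the HTCACHE

    String chooseParseSource() {
        if (scraper != null) return "pre-parsed by scraper";
        if (cacheArray != null) return "parse now from cacheArray";
        if (cacheFile != null && cacheFile.exists()) return "parse now from cacheFile";
        return "cannot be parsed, no resource available"; // skip this entry
    }
}
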
@@ -1 +0,0 @@
testblue

@@ -92,7 +92,7 @@ parseableMime=application/xhtml+xml,text/html,text/plain
# a comma-separated list of extensions that denote media file formats
# this is important to recognize <a href> - tags as not-html reference
# These files will be excluded from indexing
mediaExt=swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar,sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj,jar,deb,torrent,ogg,iso,bin,ace,tgz,rpm
mediaExt=swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar,sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj,jar,deb,torrent,ogg,iso,bin,ace,tgz,rpm,css
# the proxy's and indexing maximum ram cache size in megabytes
ramCacheSize = 12
