fixed bugs/missing code regarding new crawl stack

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@384 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 9a7c4080d5
commit 419f8fb398

@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.385
releaseVersion=0.386
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@ -20,7 +20,6 @@ There are #[num]# entries in the indexing queue:<br>
<th class="small">Initiator</th>
<th class="small">Depth</th>
<th class="small">Modified Date</th>
<th class="small">#HREF</th>
<th class="small">Anchor Name</th>
<th class="small">URL</th>
</tr>
@ -29,7 +28,6 @@ There are #[num]# entries in the indexing queue:<br>
<td width="60" class="small">#[initiator]#</td>
<td width="10" class="small">#[depth]#</td>
<td width="80" class="small">#[modified]#</td>
<td width="10" class="small">#[href]#</td>
<td width="180" class="small">#[anchor]#</td>
<td class="small"><a class="small" href="#[url]#">#[url]#</a></td>
</tr>

@ -100,7 +100,6 @@ public class IndexCreateIndexingQueue_p {
prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth());
prop.put("indexing-queue_list_"+i+"_modified", (pcentry.responseHeader() == null) ? "null" : daydate(pcentry.responseHeader().lastModified()));
prop.put("indexing-queue_list_"+i+"_href", pcentry.forkFactor());
prop.put("indexing-queue_list_"+i+"_anchor", pcentry.anchorName());
prop.put("indexing-queue_list_"+i+"_url", pcentry.normalizedURLString());
dark = !dark;

@ -443,6 +443,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
requestDate, // init date
0, // crawling depth
url, // url
"", // name of the url is unknown
requestHeader, // request headers
"200 OK", // request status
cachedResponseHeader, // response headers
@ -486,7 +487,6 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
GZIPOutputStream gzippedOut = null;
httpChunkedOutputStream chunkedOut = null;
OutputStream hfos = null;
htmlFilterContentScraper scraper = null;
httpc remote = null;
httpc.response res = null;
@ -568,7 +568,8 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
plasmaHTCache.Entry cacheEntry = cacheManager.newEntry(
requestDate,
0,
url,
url,
"",
requestHeader,
res.status,
res.responseHeader,
@ -576,33 +577,8 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
switchboard.defaultProxyProfile
);
// handle file types
if (((ext == null) || (!(plasmaParser.mediaExtContains(ext)))) &&
(plasmaParser.realtimeParsableMimeTypesContains(res.responseHeader.mime()))) {
// this is a file that is a possible candidate for parsing by the indexer
if (transformer.isIdentityTransformer()) {
this.theLogger.logDebug("create passthrough (parse candidate) for url " + url);
// no transformation, only passthrough
// this is especially the case if the bluelist is empty
// in that case, the content is not scraped here but later
hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
} else {
// make a scraper and transformer
this.theLogger.logDebug("create scraper for url " + url);
scraper = new htmlFilterContentScraper(url);
hfos = new htmlFilterOutputStream((gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond), scraper, transformer, (ext.length() == 0));
if (((htmlFilterOutputStream) hfos).binarySuspect()) {
scraper = null; // forget it, may be rubbish
this.theLogger.logDebug("Content of " + url + " is probably binary. deleted scraper.");
}
cacheEntry.scraper = scraper;
}
} else {
this.theLogger.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
scraper = null;
hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
cacheEntry.scraper = scraper;
}
// make output stream
hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
// handle incoming cookies
handleIncomingCookies(res.responseHeader, host, ip);

@ -132,15 +132,15 @@ public final class plasmaCrawlLoader extends Thread {
public void close() {
try {
// setting the stop flag to true
this.stopped = true;
// interrupting the plasmaCrawlLoader
this.interrupt();
// waiting for the thread to finish ...
// setting the stop flag to true
this.stopped = true;
// interrupting the plasmaCrawlLoader
this.interrupt();
// waiting for the thread to finish ...
this.log.logInfo("Waiting for plasmaCrawlLoader shutdown ...");
this.join(5000);
this.join(5000);
} catch (Exception e) {
// we were interrupted while waiting for the crawlLoader Thread to finish
}
@ -186,6 +186,7 @@ public final class plasmaCrawlLoader extends Thread {
public void loadParallel(
URL url,
String name,
String referer,
String initiator,
int depth,
@ -195,7 +196,7 @@ public final class plasmaCrawlLoader extends Thread {
int crawlingPriority = 5;
// creating a new crawler queue object
plasmaCrawlLoaderMessage theMsg = new plasmaCrawlLoaderMessage(url, referer,initiator,depth,profile, crawlingPriority);
plasmaCrawlLoaderMessage theMsg = new plasmaCrawlLoaderMessage(url, name, referer, initiator, depth, profile, crawlingPriority);
// adding the message to the queue
try {
@ -287,8 +288,7 @@ class CrawlerMessageQueue {
}
final class CrawlerPool extends GenericObjectPool
{
final class CrawlerPool extends GenericObjectPool {
private final ThreadGroup theThreadGroup;
public boolean isClosed = false;

@ -47,6 +47,7 @@ import java.net.URL;
public final class plasmaCrawlLoaderMessage {
public final int crawlingPriority;
public final URL url;
public final String name;
public final String referer;
public final String initiator;
public final int depth;
@ -54,13 +55,15 @@ public final class plasmaCrawlLoaderMessage {
// loadParallel(URL url, String name, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) {
public plasmaCrawlLoaderMessage(
URL url,
URL url,
String name,
String referer,
String initiator,
int depth,
plasmaCrawlProfile.entry profile,
int crawlingPriority) {
this.url = url;
this.name = name;
this.referer = referer;
this.initiator = initiator;
this.depth = depth;

@ -71,6 +71,7 @@ public final class plasmaCrawlWorker extends Thread {
public plasmaCrawlLoaderMessage theMsg;
private URL url;
private String name;
private String referer;
private String initiator;
private int depth;
@ -125,6 +126,7 @@ public final class plasmaCrawlWorker extends Thread {
this.theMsg = theMsg;
this.url = theMsg.url;
this.name = theMsg.name;
this.referer = theMsg.referer;
this.initiator = theMsg.initiator;
this.depth = theMsg.depth;
@ -198,7 +200,7 @@ public final class plasmaCrawlWorker extends Thread {
public void execute() throws IOException {
try {
this.setName(this.threadBaseName + "_" + this.url);
load(this.url, this.referer, this.initiator, this.depth, this.profile,
load(this.url, this.name, this.referer, this.initiator, this.depth, this.profile,
this.socketTimeout, this.remoteProxyHost, this.remoteProxyPort, this.remoteProxyUse,
this.cacheManager, this.log);
@ -220,6 +222,7 @@ public final class plasmaCrawlWorker extends Thread {
public static void load(
URL url,
String name,
String referer,
String initiator,
int depth,
@ -232,6 +235,7 @@ public final class plasmaCrawlWorker extends Thread {
serverLog log
) throws IOException {
load(url,
name,
referer,
initiator,
depth,
@ -248,7 +252,8 @@ public final class plasmaCrawlWorker extends Thread {
}
private static void load(
URL url,
URL url,
String name,
String referer,
String initiator,
int depth,
@ -300,7 +305,7 @@ public final class plasmaCrawlWorker extends Thread {
long contentLength = res.responseHeader.contentLength();
// reserve cache entry
plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile);
plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, name, requestHeader, res.status, res.responseHeader, initiator, profile);
// request has been placed and result has been returned. work off response
File cacheFile = cacheManager.getCachePath(url);
@ -355,6 +360,7 @@ public final class plasmaCrawlWorker extends Thread {
log.logInfo("Redirection detected ('" + res.status + "') for url " + url.toString() +
"\nRedirecting request to: " + redirectionUrl);
load(redirectionUrl,
name,
referer,
initiator,
depth,
@ -383,18 +389,19 @@ public final class plasmaCrawlWorker extends Thread {
log.logWarning("Problems detected while receiving gzip encoded content from '" + url.toString() +
"'. Retrying request without using gzip content encoding.");
load(url,
referer,
initiator,
depth,
profile,
socketTimeout,
remoteProxyHost,
remoteProxyPort,
remoteProxyUse,
cacheManager,
log,
0,
false
name,
referer,
initiator,
depth,
profile,
socketTimeout,
remoteProxyHost,
remoteProxyPort,
remoteProxyUse,
cacheManager,
log,
0,
false
);
} else {
// this may happen if the targeted host does not exist or anything with the

@ -431,13 +431,12 @@ public final class plasmaHTCache {
(urlString.toLowerCase().indexOf(".exe") >= 0));
}
public Entry newEntry(Date initDate, int depth, URL url,
public Entry newEntry(Date initDate, int depth, URL url, String name,
httpHeader requestHeader,
String responseStatus, httpHeader responseHeader,
String initiator,
plasmaCrawlProfile.entry profile) {
//System.out.println("NEW ENTRY: " + url.toString()); // DEBUG
return new Entry(initDate, depth, url, requestHeader, responseStatus, responseHeader, initiator, profile);
return new Entry(initDate, depth, url, name, requestHeader, responseStatus, responseHeader, initiator, profile);
}
public final class Entry {
@ -451,6 +450,7 @@ public final class plasmaHTCache {
public File cacheFile; // the cache file
public byte[] cacheArray; // or the cache as byte-array
public URL url;
public String name; // the name of the link, read as anchor from an <a>-tag
public String nomalizedURLHash;
public String nomalizedURLString;
public int status; // cache load/hit/stale etc status
@ -459,10 +459,9 @@ public final class plasmaHTCache {
public String language;
public plasmaCrawlProfile.entry profile;
private String initiator;
public htmlFilterContentScraper scraper;
public Entry(Date initDate, int depth, URL url,
public Entry(Date initDate, int depth, URL url, String name,
httpHeader requestHeader,
String responseStatus, httpHeader responseHeader,
String initiator,
@ -476,6 +475,7 @@ public final class plasmaHTCache {
System.out.println("internal error at httpdProxyCache.Entry: " + e);
System.exit(-1);
}
this.name = name;
this.cacheFile = getCachePath(this.url);
this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString);
@ -510,9 +510,11 @@ public final class plasmaHTCache {
// to be defined later:
this.cacheArray = null;
this.status = CACHE_UNFILLED;
this.scraper = null;
}
public String name() {
return name;
}
public String initiator() {
return initiator;
}

@ -354,7 +354,8 @@ public class plasmaSnippetCache {
private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
plasmaCrawlWorker.load(
url,
url,
"",
null,
null,
0,

@ -264,7 +264,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
this.parser = new plasmaParser();
// initialize switchboard queue
sbQueue = new plasmaSwitchboardQueue(this.cacheManager, urlPool.loadedURL, new File(plasmaPath, "switchboardQueue0.stack"), 10, profiles);
sbQueue = new plasmaSwitchboardQueue(this.cacheManager, urlPool.loadedURL, new File(plasmaPath, "switchboardQueue1.stack"), 10, profiles);
// define an extension-blacklist
log.logSystem("Parser: Initializing Extension Mappings for Media/Parser");
@ -453,8 +453,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
cacheManager.push(entry);
}
synchronized public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) throws IOException {
if (entry == null) return false;
@ -480,9 +478,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
enQueue(sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()),
entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE),
entry.initiator(), entry.depth, entry.profile.handle(),
(entry.scraper == null) ? 0 : entry.scraper.getAnchors().size(),
(entry.scraper == null) ? 0 : entry.scraper.getImages().size(),
(entry.scraper == null) ? "" : entry.scraper.getHeadline()
entry.name()
));
} else if (entry.status == plasmaHTCache.CACHE_PASSING) {
// even if the file should not be stored in the cache, it can be used to be indexed
@ -492,9 +488,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
enQueue(sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()),
entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE),
entry.initiator(), entry.depth, entry.profile.handle(),
(entry.scraper == null) ? 0 : entry.scraper.getAnchors().size(),
(entry.scraper == null) ? 0 : entry.scraper.getImages().size(),
(entry.scraper == null) ? "" : entry.scraper.getHeadline()
entry.name()
));
}
@ -504,9 +498,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
case plasmaHTCache.CACHE_UNFILLED:
log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break;
case plasmaHTCache.CACHE_FILL:
log.logInfo("CACHE FILL: " + entry.cacheFile +
((entry.cacheArray == null) ? "" : " (cacheArray is filled)") +
((entry.scraper == null) ? "" : " (scraper is filled)"));
log.logInfo("CACHE FILL: " + entry.cacheFile + ((entry.cacheArray == null) ? "" : " (cacheArray is filled)"));
break;
case plasmaHTCache.CACHE_HIT:
log.logInfo("CACHE HIT: " + entry.cacheFile); break;
@ -574,27 +566,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
profiles.close();
parser.close();
cacheManager.close();
sbQueue.close();
} catch (IOException e) {}
log.logSystem("SWITCHBOARD SHUTDOWN TERMINATED");
}
/*
public int totalSize() {
return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
}
*/
public int queueSize() {
return sbQueue.size();
//return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
}
/*
public int lUrlSize() {
return urlPool.loadedURL.size();
}
*/
public int cacheSizeMin() {
return wordIndex.size();
}
@ -812,13 +793,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// alternatively do a local crawl
if (sbQueue.size() >= crawlSlots) {
log.logDebug("LimitCrawl: too many processes in queue, dismissed (" +
if (sbQueue.size() >= indexingSlots) {
log.logDebug("LimitCrawl: too many processes in indexing queue, dismissed (" +
"sbQueueSize=" + sbQueue.size() + ")");
return false;
}
if (cacheLoader.size() >= crawlSlots) {
log.logDebug("LimitCrawl: too many loader in queue, dismissed (" +
log.logDebug("LimitCrawl: too many processes in loader queue, dismissed (" +
"cacheLoader=" + cacheLoader.size() + ")");
return false;
}
@ -924,7 +905,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// parse content
plasmaParserDocument document = null;
if ((plasmaParser.supportedFileExt(entry.url())) ||
((entry.responseHeader() != null) &&
(plasmaParser.supportedMimeTypesContains(entry.responseHeader().mime())))) {
@ -944,8 +924,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return;
}
Date loadDate = entry.responseHeader().lastModified();
if (loadDate == null) loadDate = entry.responseHeader().date();
Date loadDate = null;
if (entry.responseHeader() != null) {
loadDate = entry.responseHeader().lastModified();
if (loadDate == null) loadDate = entry.responseHeader().date();
}
if (loadDate == null) loadDate = new Date();
// put anchors on crawl stack
@ -1055,7 +1038,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// explicit delete/free resources
if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) cacheManager.deleteFile(entry.url());
document = null; entry = null;
} catch (IOException e) {
log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString());
}
@ -1166,7 +1152,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo(stats + ": urlEntry=null");
return;
}
cacheLoader.loadParallel(urlEntry.url(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile);
cacheLoader.loadParallel(urlEntry.url(), urlEntry.name(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile);
log.logInfo(stats + ": enqueued for load " + urlEntry.url());
return;
}

@ -76,8 +76,6 @@ public class plasmaSwitchboardQueue {
yacySeedDB.commonHashLength,
plasmaURL.urlCrawlDepthLength,
plasmaURL.urlCrawlProfileHandleLength,
plasmaURL.urlForkFactorLength,
plasmaURL.urlForkFactorLength,
plasmaURL.urlDescrLength
});
@ -96,8 +94,6 @@ public class plasmaSwitchboardQueue {
(entry.initiator == null) ? plasmaURL.dummyHash.getBytes() : entry.initiator.getBytes(),
serverCodings.enhancedCoder.encodeBase64Long((long) entry.depth, plasmaURL.urlCrawlDepthLength).getBytes(),
(entry.profileHandle == null) ? plasmaURL.dummyHash.getBytes() : entry.profileHandle.getBytes(),
serverCodings.enhancedCoder.encodeBase64Long((long) entry.hrefCount, plasmaURL.urlForkFactorLength).getBytes(),
serverCodings.enhancedCoder.encodeBase64Long((long) entry.imageCount, plasmaURL.urlForkFactorLength).getBytes(),
(entry.anchorName == null) ? "-".getBytes() : entry.anchorName.getBytes()
});
}
@ -126,9 +122,8 @@ public class plasmaSwitchboardQueue {
}
public Entry newEntry(URL url, String referrer, Date ifModifiedSince, boolean requestWithCookie,
String initiator, int depth, String profilehandle,
int hrefCount, int imageCount, String anchorName) {
return new Entry(url, referrer, ifModifiedSince, requestWithCookie, initiator, depth, profilehandle, hrefCount, imageCount, anchorName);
String initiator, int depth, String profilehandle, String anchorName) {
return new Entry(url, referrer, ifModifiedSince, requestWithCookie, initiator, depth, profilehandle, anchorName);
}
public class Entry {
@ -139,8 +134,6 @@ public class plasmaSwitchboardQueue {
private String initiator; // yacySeedDB.commonHashLength
private int depth; // plasmaURL.urlCrawlDepthLength
private String profileHandle; // plasmaURL.urlCrawlProfileHandleLength
private int hrefCount; // plasmaURL.urlForkFactorLength
private int imageCount; // plasmaURL.urlForkFactorLength
private String anchorName; // plasmaURL.urlDescrLength
// computed values
@ -149,8 +142,7 @@ public class plasmaSwitchboardQueue {
private URL referrerURL;
public Entry(URL url, String referrer, Date ifModifiedSince, boolean requestWithCookie,
String initiator, int depth, String profileHandle,
int hrefCount, int imageCount, String anchorName) {
String initiator, int depth, String profileHandle, String anchorName) {
this.url = url;
this.referrerHash = referrer;
this.ifModifiedSince = ifModifiedSince;
@ -158,8 +150,6 @@ public class plasmaSwitchboardQueue {
this.initiator = initiator;
this.depth = depth;
this.profileHandle = profileHandle;
this.hrefCount = hrefCount;
this.imageCount = imageCount;
this.anchorName = anchorName;
this.profileEntry = null;
@ -181,9 +171,7 @@ public class plasmaSwitchboardQueue {
this.initiator = new String(row[4]);
this.depth = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(row[5]));
this.profileHandle = new String(row[6]);
this.hrefCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(row[7]));
this.imageCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(row[8]));
this.anchorName = new String(row[9]);
this.anchorName = new String(row[7]);
this.profileEntry = null;
this.responseHeader = null;
@ -248,14 +236,6 @@ public class plasmaSwitchboardQueue {
}
return referrerURL;
}
public int forkFactor() {
return hrefCount;
}
public int images() {
return imageCount;
}
public String anchorName() {
return anchorName;

Loading…
Cancel
Save