fixed bugs/missing code regarding new crawl stack

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@384 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 9a7c4080d5
commit 419f8fb398
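
The common thread of this commit is that the anchor text of a crawled link (its "name") now travels with the crawl job itself, from the crawl stack through plasmaCrawlLoaderMessage, plasmaCrawlWorker and plasmaHTCache.Entry, instead of being reconstructed later by the htmlFilterContentScraper that used to be attached to the proxy's cache entry. The following is a minimal, hypothetical sketch of that data flow with simplified stub types; it is not the actual YaCy API.

    import java.net.URL;

    // Hypothetical, simplified stand-ins for the YaCy classes touched by this commit.
    final class CrawlMessage {
        final URL url;
        final String name;     // anchor text of the link, new in this commit
        final String referer;
        final int depth;
        CrawlMessage(URL url, String name, String referer, int depth) {
            this.url = url; this.name = name; this.referer = referer; this.depth = depth;
        }
    }

    final class CacheEntry {
        final URL url;
        final String name;     // carried into the cache entry instead of a scraper reference
        CacheEntry(URL url, String name) { this.url = url; this.name = name; }
        String name() { return name; }
    }

    public class CrawlNameFlowSketch {
        // The loader receives the name together with the URL and hands it on unchanged.
        static CacheEntry load(CrawlMessage msg) {
            return new CacheEntry(msg.url, msg.name);
        }
        public static void main(String[] args) throws Exception {
            CrawlMessage msg = new CrawlMessage(new URL("http://example.org/"), "Example anchor", null, 0);
            System.out.println("indexing queue will see anchor name: " + load(msg).name());
        }
    }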

@@ -3,7 +3,7 @@ javacSource=1.4
 javacTarget=1.4
 # Release Configuration
-releaseVersion=0.385
+releaseVersion=0.386
 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@@ -20,7 +20,6 @@ There are #[num]# entries in the indexing queue:<br>
 <th class="small">Initiator</th>
 <th class="small">Depth</th>
 <th class="small">Modified Date</th>
-<th class="small">#HREF</th>
 <td class="small">Anchor Name</th>
 <th class="small">URL</th>
 </tr>
@@ -29,7 +28,6 @@ There are #[num]# entries in the indexing queue:<br>
 <td width="60" class="small">#[initiator]#</td>
 <td width="10" class="small">#[depth]#</td>
 <td width="80" class="small">#[modified]#</td>
-<td width="10" class="small">#[href]#</td>
 <td width="180" class="small">#[anchor]#</td>
 <td class="small"><a class="small" href="#[url]#">#[url]#</a></td>
 </tr>

@@ -100,7 +100,6 @@ public class IndexCreateIndexingQueue_p {
                 prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
                 prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth());
                 prop.put("indexing-queue_list_"+i+"_modified", (pcentry.responseHeader() == null) ? "null" : daydate(pcentry.responseHeader().lastModified()));
-                prop.put("indexing-queue_list_"+i+"_href", pcentry.forkFactor());
                 prop.put("indexing-queue_list_"+i+"_anchor", pcentry.anchorName());
                 prop.put("indexing-queue_list_"+i+"_url", pcentry.normalizedURLString());
                 dark = !dark;
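
Since the #[href]# placeholder is gone from IndexCreateIndexingQueue_p.html, the matching prop.put(...) call in the servlet is removed as well; otherwise the fork factor would still be computed and then silently dropped by the template. A rough illustration of that placeholder mechanism, using a plain Map instead of YaCy's serverObjects (hypothetical names, not the real template engine):

    import java.util.HashMap;
    import java.util.Map;

    public class TemplatePlaceholderSketch {
        // Very loose stand-in for the template engine: replaces #[key]# with the stored value.
        static String fill(String template, Map<String, String> prop) {
            String out = template;
            for (Map.Entry<String, String> e : prop.entrySet()) {
                out = out.replace("#[" + e.getKey() + "]#", e.getValue());
            }
            return out;
        }
        public static void main(String[] args) {
            Map<String, String> prop = new HashMap<>();
            prop.put("indexing-queue_list_0_depth", "2");
            prop.put("indexing-queue_list_0_anchor", "Example anchor");
            // no "..._href" entry any more: the template row no longer contains that cell
            String row = "<td>#[indexing-queue_list_0_depth]#</td><td>#[indexing-queue_list_0_anchor]#</td>";
            System.out.println(fill(row, prop));
        }
    }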

@@ -443,6 +443,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
                     requestDate,          // init date
                     0,                    // crawling depth
                     url,                  // url
+                    "",                   // name of the url is unknown
                     requestHeader,        // request headers
                     "200 OK",             // request status
                     cachedResponseHeader, // response headers
@@ -486,7 +487,6 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
         GZIPOutputStream gzippedOut = null;
         httpChunkedOutputStream chunkedOut = null;
         OutputStream hfos = null;
-        htmlFilterContentScraper scraper = null;
         httpc remote = null;
         httpc.response res = null;
@@ -568,7 +568,8 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
                 plasmaHTCache.Entry cacheEntry = cacheManager.newEntry(
                     requestDate,
                     0,
                     url,
+                    "",
                     requestHeader,
                     res.status,
                     res.responseHeader,
@@ -576,33 +577,8 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
                     switchboard.defaultProxyProfile
                 );
-                // handle file types
-                if (((ext == null) || (!(plasmaParser.mediaExtContains(ext)))) &&
-                    (plasmaParser.realtimeParsableMimeTypesContains(res.responseHeader.mime()))) {
-                    // this is a file that is a possible candidate for parsing by the indexer
-                    if (transformer.isIdentityTransformer()) {
-                        this.theLogger.logDebug("create passthrough (parse candidate) for url " + url);
-                        // no transformation, only passthrough
-                        // this isng especially the case if the bluelist is empty
-                        // in that case, the content is not scraped here but later
-                        hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
-                    } else {
-                        // make a scraper and transformer
-                        this.theLogger.logDebug("create scraper for url " + url);
-                        scraper = new htmlFilterContentScraper(url);
-                        hfos = new htmlFilterOutputStream((gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond), scraper, transformer, (ext.length() == 0));
-                        if (((htmlFilterOutputStream) hfos).binarySuspect()) {
-                            scraper = null; // forget it, may be rubbish
-                            this.theLogger.logDebug("Content of " + url + " is probably binary. deleted scraper.");
-                        }
-                        cacheEntry.scraper = scraper;
-                    }
-                } else {
-                    this.theLogger.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
-                    scraper = null;
-                    hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
-                    cacheEntry.scraper = scraper;
-                }
+                // make output stream
+                hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
                 // handle incoming cookies
                 handleIncomingCookies(res.responseHeader, host, ip);
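
With the scraper removed from the proxy path, the output stream is now chosen directly: prefer the gzip stream, else the chunked stream, else the raw client stream. A minimal sketch of that selection, with stand-in stream types rather than YaCy's httpd classes:

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.OutputStream;
    import java.util.zip.GZIPOutputStream;

    public class ProxyOutputSelectionSketch {
        // Mirrors the simplified logic: gzip wins over chunked, chunked wins over the plain respond stream.
        static OutputStream select(GZIPOutputStream gzippedOut, OutputStream chunkedOut, OutputStream respond) {
            return (gzippedOut != null) ? gzippedOut : ((chunkedOut != null) ? chunkedOut : respond);
        }
        public static void main(String[] args) throws IOException {
            OutputStream respond = new ByteArrayOutputStream();
            // neither gzip nor chunked wrapping in this example, so the raw stream is used
            OutputStream hfos = select(null, null, respond);
            hfos.write("passthrough".getBytes());
            System.out.println(hfos == respond);  // true
        }
    }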

@@ -132,15 +132,15 @@ public final class plasmaCrawlLoader extends Thread {
     public void close() {
         try {
             // setting the stop flag to true
             this.stopped = true;
             // interrupting the plasmaCrawlLoader
             this.interrupt();
             // waiting for the thread to finish ...
             this.log.logInfo("Waiting for plasmaCrawlLoader shutdown ...");
             this.join(5000);
         } catch (Exception e) {
             // we where interrupted while waiting for the crawlLoader Thread to finish
         }
@@ -186,6 +186,7 @@ public final class plasmaCrawlLoader extends Thread {
     public void loadParallel(
             URL url,
+            String name,
             String referer,
             String initiator,
             int depth,
@@ -195,7 +196,7 @@ public final class plasmaCrawlLoader extends Thread {
         int crawlingPriority = 5;
         // creating a new crawler queue object
-        plasmaCrawlLoaderMessage theMsg = new plasmaCrawlLoaderMessage(url, referer,initiator,depth,profile, crawlingPriority);
+        plasmaCrawlLoaderMessage theMsg = new plasmaCrawlLoaderMessage(url, name, referer, initiator, depth, profile, crawlingPriority);
         // adding the message to the queue
         try {
@@ -287,8 +288,7 @@ class CrawlerMessageQueue {
 }
-final class CrawlerPool extends GenericObjectPool
-{
+final class CrawlerPool extends GenericObjectPool {
     private final ThreadGroup theThreadGroup;
     public boolean isClosed = false;

@@ -47,6 +47,7 @@ import java.net.URL;
 public final class plasmaCrawlLoaderMessage {
     public final int crawlingPriority;
     public final URL url;
+    public final String name;
     public final String referer;
     public final String initiator;
     public final int depth;
@@ -54,13 +55,15 @@ public final class plasmaCrawlLoaderMessage {
     // loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) {
     public plasmaCrawlLoaderMessage(
             URL url,
+            String name,
             String referer,
             String initiator,
             int depth,
             plasmaCrawlProfile.entry profile,
             int crawlingPriority) {
         this.url = url;
+        this.name = name;
         this.referer = referer;
         this.initiator = initiator;
         this.depth = depth;
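
loadParallel and plasmaCrawlLoaderMessage now take the anchor name directly after the URL, and each message still carries a crawlingPriority. The internals of CrawlerMessageQueue are not part of this diff; the sketch below only illustrates what a priority-ordered crawler message queue keyed on that field could look like, using java.util.PriorityQueue and a stub message class:

    import java.util.Comparator;
    import java.util.PriorityQueue;

    public class CrawlerQueueSketch {
        // Minimal stand-in for a crawl message: only the fields needed to order the queue.
        static final class Msg {
            final String url;
            final String name;          // anchor text, new parameter in this commit
            final int crawlingPriority; // lower value = handled first in this sketch
            Msg(String url, String name, int crawlingPriority) {
                this.url = url; this.name = name; this.crawlingPriority = crawlingPriority;
            }
        }
        public static void main(String[] args) {
            PriorityQueue<Msg> queue = new PriorityQueue<>(Comparator.comparingInt((Msg m) -> m.crawlingPriority));
            queue.add(new Msg("http://example.org/a", "link A", 5));
            queue.add(new Msg("http://example.org/b", "link B", 1));
            Msg next = queue.poll();
            System.out.println("next crawl job: " + next.url + " (" + next.name + ")");
        }
    }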

@@ -71,6 +71,7 @@ public final class plasmaCrawlWorker extends Thread {
     public plasmaCrawlLoaderMessage theMsg;
     private URL url;
+    private String name;
     private String referer;
     private String initiator;
     private int depth;
@@ -125,6 +126,7 @@ public final class plasmaCrawlWorker extends Thread {
         this.theMsg = theMsg;
         this.url = theMsg.url;
+        this.name = theMsg.name;
         this.referer = theMsg.referer;
         this.initiator = theMsg.initiator;
         this.depth = theMsg.depth;
@@ -198,7 +200,7 @@ public final class plasmaCrawlWorker extends Thread {
     public void execute() throws IOException {
         try {
             this.setName(this.threadBaseName + "_" + this.url);
-            load(this.url, this.referer, this.initiator, this.depth, this.profile,
+            load(this.url, this.name, this.referer, this.initiator, this.depth, this.profile,
                  this.socketTimeout, this.remoteProxyHost, this.remoteProxyPort, this.remoteProxyUse,
                  this.cacheManager, this.log);
@@ -220,6 +222,7 @@ public final class plasmaCrawlWorker extends Thread {
     public static void load(
             URL url,
+            String name,
             String referer,
             String initiator,
             int depth,
@@ -232,6 +235,7 @@ public final class plasmaCrawlWorker extends Thread {
             serverLog log
     ) throws IOException {
         load(url,
+             name,
              referer,
              initiator,
              depth,
@@ -248,7 +252,8 @@ public final class plasmaCrawlWorker extends Thread {
     }
     private static void load(
             URL url,
+            String name,
             String referer,
             String initiator,
             int depth,
@@ -300,7 +305,7 @@ public final class plasmaCrawlWorker extends Thread {
                 long contentLength = res.responseHeader.contentLength();
                 // reserve cache entry
-                plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile);
+                plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, name, requestHeader, res.status, res.responseHeader, initiator, profile);
                 // request has been placed and result has been returned. work off response
                 File cacheFile = cacheManager.getCachePath(url);
@@ -355,6 +360,7 @@ public final class plasmaCrawlWorker extends Thread {
                     log.logInfo("Redirection detected ('" + res.status + "') for url " + url.toString() +
                                 "\nRedirecting request to: " + redirectionUrl);
                     load(redirectionUrl,
+                         name,
                          referer,
                          initiator,
                          depth,
@@ -383,18 +389,19 @@ public final class plasmaCrawlWorker extends Thread {
                 log.logWarning("Problems detected while receiving gzip encoded content from '" + url.toString() +
                                "'. Retrying request without using gzip content encoding.");
                 load(url,
-                     referer,
-                     initiator,
-                     depth,
-                     profile,
-                     socketTimeout,
-                     remoteProxyHost,
-                     remoteProxyPort,
-                     remoteProxyUse,
-                     cacheManager,
-                     log,
-                     0,
-                     false
+                     name,
+                     referer,
+                     initiator,
+                     depth,
+                     profile,
+                     socketTimeout,
+                     remoteProxyHost,
+                     remoteProxyPort,
+                     remoteProxyUse,
+                     cacheManager,
+                     log,
+                     0,
+                     false
                 );
             } else {
                 // this may happen if the targeted host does not exist or anything with the
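
Both the redirect branch and the gzip-retry branch re-enter load() recursively, so forgetting the new name parameter in either call site would shift every following argument by one position. A heavily reduced sketch of that retry pattern, with a hypothetical signature rather than the real plasmaCrawlWorker.load:

    public class RetrySketch {
        // Hypothetical loader that retries once without gzip encoding, forwarding all other arguments unchanged.
        static String load(String url, String name, int depth, boolean useGzip, int retryCount) {
            if (useGzip && retryCount == 0) {
                // simulate a broken gzip response: retry with identical arguments except the encoding flag
                return load(url, name, depth, false, retryCount + 1);
            }
            return "loaded " + url + " (anchor: " + name + ", depth " + depth + ", gzip " + useGzip + ")";
        }
        public static void main(String[] args) {
            System.out.println(load("http://example.org/", "Example anchor", 0, true, 0));
        }
    }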

@@ -431,13 +431,12 @@ public final class plasmaHTCache {
                 (urlString.toLowerCase().indexOf(".exe") >= 0));
     }
-    public Entry newEntry(Date initDate, int depth, URL url,
+    public Entry newEntry(Date initDate, int depth, URL url, String name,
                           httpHeader requestHeader,
                           String responseStatus, httpHeader responseHeader,
                           String initiator,
                           plasmaCrawlProfile.entry profile) {
-        //System.out.println("NEW ENTRY: " + url.toString()); // DEBUG
-        return new Entry(initDate, depth, url, requestHeader, responseStatus, responseHeader, initiator, profile);
+        return new Entry(initDate, depth, url, name, requestHeader, responseStatus, responseHeader, initiator, profile);
     }
     public final class Entry {
@@ -451,6 +450,7 @@ public final class plasmaHTCache {
         public File cacheFile;    // the cache file
         public byte[] cacheArray; // or the cache as byte-array
         public URL url;
+        public String name;       // the name of the link, read as anchor from an <a>-tag
         public String nomalizedURLHash;
         public String nomalizedURLString;
         public int status;        // cache load/hit/stale etc status
@@ -459,10 +459,9 @@ public final class plasmaHTCache {
         public String language;
         public plasmaCrawlProfile.entry profile;
         private String initiator;
-        public htmlFilterContentScraper scraper;
-        public Entry(Date initDate, int depth, URL url,
+        public Entry(Date initDate, int depth, URL url, String name,
                      httpHeader requestHeader,
                      String responseStatus, httpHeader responseHeader,
                      String initiator,
@@ -476,6 +475,7 @@ public final class plasmaHTCache {
                 System.out.println("internal error at httpdProxyCache.Entry: " + e);
                 System.exit(-1);
             }
+            this.name = name;
             this.cacheFile = getCachePath(this.url);
             this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString);
@@ -510,9 +510,11 @@ public final class plasmaHTCache {
             // to be defined later:
             this.cacheArray = null;
             this.status = CACHE_UNFILLED;
-            this.scraper = null;
         }
+        public String name() {
+            return name;
+        }
         public String initiator() {
             return initiator;
         }

@@ -354,7 +354,8 @@ public class plasmaSnippetCache {
     private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
         plasmaCrawlWorker.load(
             url,
+            "",
             null,
             null,
             0,

@@ -264,7 +264,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         this.parser = new plasmaParser();
         // initialize switchboard queue
-        sbQueue = new plasmaSwitchboardQueue(this.cacheManager, urlPool.loadedURL, new File(plasmaPath, "switchboardQueue0.stack"), 10, profiles);
+        sbQueue = new plasmaSwitchboardQueue(this.cacheManager, urlPool.loadedURL, new File(plasmaPath, "switchboardQueue1.stack"), 10, profiles);
         // define an extension-blacklist
         log.logSystem("Parser: Initializing Extension Mappings for Media/Parser");
@@ -453,8 +453,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         cacheManager.push(entry);
     }
     synchronized public boolean htEntryStoreProcess(plasmaHTCache.Entry entry) throws IOException {
         if (entry == null) return false;
@@ -480,9 +478,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 enQueue(sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()),
                         entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE),
                         entry.initiator(), entry.depth, entry.profile.handle(),
-                        (entry.scraper == null) ? 0 : entry.scraper.getAnchors().size(),
-                        (entry.scraper == null) ? 0 : entry.scraper.getImages().size(),
-                        (entry.scraper == null) ? "" : entry.scraper.getHeadline()
+                        entry.name()
                 ));
             } else if (entry.status == plasmaHTCache.CACHE_PASSING) {
                 // even if the file should not be stored in the cache, it can be used to be indexed
@@ -492,9 +488,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 enQueue(sbQueue.newEntry(entry.url, plasmaURL.urlHash(entry.referrerURL()),
                         entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE),
                         entry.initiator(), entry.depth, entry.profile.handle(),
-                        (entry.scraper == null) ? 0 : entry.scraper.getAnchors().size(),
-                        (entry.scraper == null) ? 0 : entry.scraper.getImages().size(),
-                        (entry.scraper == null) ? "" : entry.scraper.getHeadline()
+                        entry.name()
                 ));
             }
@@ -504,9 +498,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             case plasmaHTCache.CACHE_UNFILLED:
                 log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break;
             case plasmaHTCache.CACHE_FILL:
-                log.logInfo("CACHE FILL: " + entry.cacheFile +
-                            ((entry.cacheArray == null) ? "" : " (cacheArray is filled)") +
-                            ((entry.scraper == null) ? "" : " (scraper is filled)"));
+                log.logInfo("CACHE FILL: " + entry.cacheFile + ((entry.cacheArray == null) ? "" : " (cacheArray is filled)"));
                 break;
             case plasmaHTCache.CACHE_HIT:
                 log.logInfo("CACHE HIT: " + entry.cacheFile); break;
@@ -574,27 +566,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             profiles.close();
             parser.close();
             cacheManager.close();
+            sbQueue.close();
         } catch (IOException e) {}
         log.logSystem("SWITCHBOARD SHUTDOWN TERMINATED");
     }
-    /*
-    public int totalSize() {
-        return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
-    }
-    */
     public int queueSize() {
         return sbQueue.size();
         //return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
     }
-    /*
-    public int lUrlSize() {
-        return urlPool.loadedURL.size();
-    }
-    */
     public int cacheSizeMin() {
         return wordIndex.size();
     }
@@ -812,13 +793,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         }
         // alternatively do a local crawl
-        if (sbQueue.size() >= crawlSlots) {
-            log.logDebug("LimitCrawl: too many processes in queue, dismissed (" +
+        if (sbQueue.size() >= indexingSlots) {
+            log.logDebug("LimitCrawl: too many processes in indexing queue, dismissed (" +
                          "sbQueueSize=" + sbQueue.size() + ")");
            return false;
        }
        if (cacheLoader.size() >= crawlSlots) {
-            log.logDebug("LimitCrawl: too many loader in queue, dismissed (" +
+            log.logDebug("LimitCrawl: too many processes in loader queue, dismissed (" +
                          "cacheLoader=" + cacheLoader.size() + ")");
             return false;
         }
@@ -924,7 +905,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             // parse content
             plasmaParserDocument document = null;
-            if ((plasmaParser.supportedFileExt(entry.url())) ||
                 ((entry.responseHeader() != null) &&
                 (plasmaParser.supportedMimeTypesContains(entry.responseHeader().mime())))) {
@@ -944,8 +924,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 return;
             }
-            Date loadDate = entry.responseHeader().lastModified();
-            if (loadDate == null) loadDate = entry.responseHeader().date();
+            Date loadDate = null;
+            if (entry.responseHeader() != null) {
+                loadDate = entry.responseHeader().lastModified();
+                if (loadDate == null) loadDate = entry.responseHeader().date();
+            }
             if (loadDate == null) loadDate = new Date();
             // put anchors on crawl stack
@@ -1055,7 +1038,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             }
             // explicit delete/free resources
+            if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) cacheManager.deleteFile(entry.url());
             document = null; entry = null;
         } catch (IOException e) {
             log.logError("ERROR in plasmaSwitchboard.process(): " + e.toString());
         }
@@ -1166,7 +1152,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             log.logInfo(stats + ": urlEntry=null");
             return;
         }
-        cacheLoader.loadParallel(urlEntry.url(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile);
+        cacheLoader.loadParallel(urlEntry.url(), urlEntry.name(), urlEntry.referrerHash(), urlEntry.initiator(), urlEntry.depth(), profile);
         log.logInfo(stats + ": enqueued for load " + urlEntry.url());
         return;
     }
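
The loadDate handling is the actual bug fix in this file: entry.responseHeader() can be null, so it is now checked before lastModified() and date() are consulted, with new Date() remaining the last resort. A compact restatement of that fallback chain, using a hypothetical minimal header type rather than YaCy's httpHeader:

    import java.util.Date;

    public class LoadDateSketch {
        // Hypothetical minimal response header: either date may be missing.
        static final class ResponseHeader {
            final Date lastModified;
            final Date date;
            ResponseHeader(Date lastModified, Date date) { this.lastModified = lastModified; this.date = date; }
            Date lastModified() { return lastModified; }
            Date date() { return date; }
        }
        // Same fallback order as the patched code: lastModified, then date, then "now".
        static Date loadDate(ResponseHeader responseHeader) {
            Date loadDate = null;
            if (responseHeader != null) {
                loadDate = responseHeader.lastModified();
                if (loadDate == null) loadDate = responseHeader.date();
            }
            if (loadDate == null) loadDate = new Date();
            return loadDate;
        }
        public static void main(String[] args) {
            System.out.println(loadDate(null));                                    // falls back to current time
            System.out.println(loadDate(new ResponseHeader(null, new Date(0L)))); // falls back to the Date header
        }
    }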

@@ -76,8 +76,6 @@ public class plasmaSwitchboardQueue {
             yacySeedDB.commonHashLength,
             plasmaURL.urlCrawlDepthLength,
             plasmaURL.urlCrawlProfileHandleLength,
-            plasmaURL.urlForkFactorLength,
-            plasmaURL.urlForkFactorLength,
             plasmaURL.urlDescrLength
         });
@@ -96,8 +94,6 @@ public class plasmaSwitchboardQueue {
             (entry.initiator == null) ? plasmaURL.dummyHash.getBytes() : entry.initiator.getBytes(),
             serverCodings.enhancedCoder.encodeBase64Long((long) entry.depth, plasmaURL.urlCrawlDepthLength).getBytes(),
             (entry.profileHandle == null) ? plasmaURL.dummyHash.getBytes() : entry.profileHandle.getBytes(),
-            serverCodings.enhancedCoder.encodeBase64Long((long) entry.hrefCount, plasmaURL.urlForkFactorLength).getBytes(),
-            serverCodings.enhancedCoder.encodeBase64Long((long) entry.imageCount, plasmaURL.urlForkFactorLength).getBytes(),
             (entry.anchorName == null) ? "-".getBytes() : entry.anchorName.getBytes()
         });
     }
@@ -126,9 +122,8 @@ public class plasmaSwitchboardQueue {
     }
     public Entry newEntry(URL url, String referrer, Date ifModifiedSince, boolean requestWithCookie,
-                          String initiator, int depth, String profilehandle,
-                          int hrefCount, int imageCount, String anchorName) {
-        return new Entry(url, referrer, ifModifiedSince, requestWithCookie, initiator, depth, profilehandle, hrefCount, imageCount, anchorName);
+                          String initiator, int depth, String profilehandle, String anchorName) {
+        return new Entry(url, referrer, ifModifiedSince, requestWithCookie, initiator, depth, profilehandle, anchorName);
     }
     public class Entry {
@@ -139,8 +134,6 @@ public class plasmaSwitchboardQueue {
         private String initiator;     // yacySeedDB.commonHashLength
         private int depth;            // plasmaURL.urlCrawlDepthLength
         private String profileHandle; // plasmaURL.urlCrawlProfileHandleLength
-        private int hrefCount;        // plasmaURL.urlForkFactorLength
-        private int imageCount;       // plasmaURL.urlForkFactorLength
         private String anchorName;    // plasmaURL.urlDescrLength
         // computed values
@@ -149,8 +142,7 @@ public class plasmaSwitchboardQueue {
         private URL referrerURL;
         public Entry(URL url, String referrer, Date ifModifiedSince, boolean requestWithCookie,
-                     String initiator, int depth, String profileHandle,
-                     int hrefCount, int imageCount, String anchorName) {
+                     String initiator, int depth, String profileHandle, String anchorName) {
             this.url = url;
             this.referrerHash = referrer;
             this.ifModifiedSince = ifModifiedSince;
@@ -158,8 +150,6 @@ public class plasmaSwitchboardQueue {
             this.initiator = initiator;
             this.depth = depth;
             this.profileHandle = profileHandle;
-            this.hrefCount = hrefCount;
-            this.imageCount = imageCount;
             this.anchorName = anchorName;
             this.profileEntry = null;
@@ -181,9 +171,7 @@ public class plasmaSwitchboardQueue {
             this.initiator = new String(row[4]);
             this.depth = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(row[5]));
             this.profileHandle = new String(row[6]);
-            this.hrefCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(row[7]));
-            this.imageCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(row[8]));
-            this.anchorName = new String(row[9]);
+            this.anchorName = new String(row[7]);
             this.profileEntry = null;
             this.responseHeader = null;
@@ -248,14 +236,6 @@ public class plasmaSwitchboardQueue {
             }
             return referrerURL;
         }
-        public int forkFactor() {
-            return hrefCount;
-        }
-        public int images() {
-            return imageCount;
-        }
         public String anchorName() {
             return anchorName;
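
The on-disk record of the switchboard queue loses its two base64-encoded fork-factor columns (hrefCount, imageCount), so the anchor name moves up to row[7]; only row[4] through row[7] are visible in this diff, the earlier columns are not shown here. The queue file is also renamed to switchboardQueue1.stack in plasmaSwitchboard, presumably so that records written in the old, wider layout are not read back. A rough sketch of serializing a queue entry in the reduced layout, using plain byte arrays instead of YaCy's serverCodings/kelondro machinery (hypothetical values and column order for the unshown leading fields):

    import java.nio.charset.StandardCharsets;

    public class QueueRowSketch {
        // Simplified stand-in for the fixed-width record; the last four columns mirror the diff:
        // initiator, depth, profile handle, anchor name (fork factors removed).
        static byte[][] encodeRow(String urlHash, String referrerHash, String dateStamp, String flags,
                                  String initiator, int depth, String profileHandle, String anchorName) {
            return new byte[][] {
                urlHash.getBytes(StandardCharsets.UTF_8),
                referrerHash.getBytes(StandardCharsets.UTF_8),
                dateStamp.getBytes(StandardCharsets.UTF_8),
                flags.getBytes(StandardCharsets.UTF_8),
                initiator.getBytes(StandardCharsets.UTF_8),
                Integer.toString(depth).getBytes(StandardCharsets.UTF_8), // the real code base64-encodes this
                profileHandle.getBytes(StandardCharsets.UTF_8),
                (anchorName == null ? "-" : anchorName).getBytes(StandardCharsets.UTF_8)
            };
        }
        public static void main(String[] args) {
            byte[][] row = encodeRow("AAAAAAAAAAAA", "------------", "20050101", "0",
                                     "peerhash0000", 2, "profile00000", "Example anchor");
            System.out.println(row.length + " columns, anchor name in row[7]: "
                    + new String(row[7], StandardCharsets.UTF_8));
        }
    }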
