- removed superfluous crawl cache

- refactored crawler classes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6221 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 8103ccec4c
commit ca72ed7526
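
For orientation, this commit renames de.anomic.crawler.CrawlEntry to de.anomic.crawler.retrieval.Request, de.anomic.http.httpDocument to de.anomic.crawler.retrieval.Response, and de.anomic.crawler.ProtocolLoader to de.anomic.crawler.retrieval.LoaderDispatcher, moves HTTPLoader and FTPLoader into the retrieval package, and drops the in-process cache of IndexingStack in favour of plasmaSwitchboard.getActiveQueueSize(). The following caller-side sketch is reconstructed from the hunks below; sb (a plasmaSwitchboard), url (a yacyURL) and the count of 100 are assumed for illustration only, so the snippet is not a standalone program.

    import java.util.ArrayList;

    import de.anomic.crawler.NoticedURL;
    import de.anomic.crawler.retrieval.Request;   // was de.anomic.crawler.CrawlEntry
    import de.anomic.crawler.retrieval.Response;  // was de.anomic.http.httpDocument

    // crawl queue and worker entries are now Request objects
    final Request[] workers = sb.crawlQueues.activeWorkerEntries();
    final ArrayList<Request> queued = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, 100);

    // loading a resource from the web now yields a Response (formerly httpDocument);
    // loadResourceFromWeb is declared to throw IOException
    final Response loaded = sb.crawlQueues.loadResourceFromWeb(url, true, false);

    // the indexing in-process cache was removed; its size is now aggregated by the switchboard
    final int activeIndexing = sb.getActiveQueueSize();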

@ -34,7 +34,7 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.data.listManager;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;

@ -36,7 +36,7 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.data.listManager;
import de.anomic.data.translator;
import de.anomic.http.httpClient;

@ -86,7 +86,7 @@ public class IndexCreateIndexingQueue_p {
yacySeed initiator;
boolean dark;
if ((sb.crawler.indexingStack.size() == 0) && (sb.crawler.indexingStack.getActiveQueueSize() == 0)) {
if ((sb.crawler.indexingStack.size() == 0) && (sb.getActiveQueueSize() == 0)) {
prop.put("indexing-queue", "0"); //is empty
} else {
prop.put("indexing-queue", "1"); // there are entries in the queue or in process
@ -98,7 +98,6 @@ public class IndexCreateIndexingQueue_p {
// getting all entries that are currently in process
final ArrayList<IndexingStack.QueueEntry> entryList = new ArrayList<IndexingStack.QueueEntry>();
entryList.addAll(sb.crawler.indexingStack.getActiveQueueEntries());
final int inProcessCount = entryList.size();
// getting all enqueued entries

@ -24,7 +24,7 @@
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -44,7 +44,7 @@ public class IndexCreateLoaderQueue_p {
} else {
prop.put("loader-set", "1");
boolean dark = true;
final CrawlEntry[] w = sb.crawlQueues.activeWorkerEntries();
final Request[] w = sb.crawlQueues.activeWorkerEntries();
yacySeed initiator;
int count = 0;
for (int i = 0; i < w.length; i++) {

@ -29,9 +29,9 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -85,9 +85,9 @@ public class IndexCreateWWWGlobalQueue_p {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final ArrayList<CrawlEntry> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, showLimit);
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, showLimit);
CrawlEntry urle;
Request urle;
boolean dark = true;
yacySeed initiator;
String profileHandle;

@ -33,12 +33,12 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.NoticedURL;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.retrieval.Request;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacySeed;
@ -109,8 +109,8 @@ public class IndexCreateWWWLocalQueue_p {
}
} else {
// iterating through the list of URLs
final Iterator<CrawlEntry> iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.STACK_TYPE_CORE);
CrawlEntry entry;
final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.STACK_TYPE_CORE);
Request entry;
while (iter.hasNext()) {
if ((entry = iter.next()) == null) continue;
String value = null;
@ -154,9 +154,9 @@ public class IndexCreateWWWLocalQueue_p {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final ArrayList<CrawlEntry> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
CrawlEntry urle;
Request urle;
boolean dark = true;
yacySeed initiator;
String profileHandle;

@ -29,9 +29,9 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -85,9 +85,9 @@ public class IndexCreateWWWRemoteQueue_p {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final ArrayList<CrawlEntry> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_REMOTE, showLimit);
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_REMOTE, showLimit);
CrawlEntry urle;
Request urle;
boolean dark = true;
yacySeed initiator;
String profileHandle;

@ -35,7 +35,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.http.httpClient;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.util.DateFormatter;

@ -34,8 +34,8 @@ import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Date;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
@ -159,7 +159,7 @@ public class QuickCrawlLink_p {
// stack URL
String reasonString = null;
reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash,
crawlingStartURL,
null,

@ -286,7 +286,7 @@ public class Status {
prop.putNum("connectionsMax", httpd.getMaxSessionCount());
// Queue information
final int indexingJobCount = sb.getThread("80_indexing").getJobCount() + sb.crawler.indexingStack.getActiveQueueSize();
final int indexingJobCount = sb.getThread("80_indexing").getJobCount() + sb.getActiveQueueSize();
final int indexingMaxCount = (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30);
final int indexingPercent = (indexingMaxCount==0)?0:indexingJobCount*100/indexingMaxCount;
prop.putNum("indexingQueueSize", indexingJobCount);

@ -34,6 +34,7 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import de.anomic.crawler.retrieval.Response;
import de.anomic.document.Condenser;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
@ -42,7 +43,6 @@ import de.anomic.document.parser.html.ImageEntry;
import de.anomic.http.httpClient;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpDocument;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.plasmaHTCache;
@ -152,7 +152,7 @@ public class ViewFile {
// if the resource body was not cached we try to load it from web
if (resource == null) {
httpDocument entry = null;
Response entry = null;
try {
entry = sb.crawlQueues.loadResourceFromWeb(url, true, false);
} catch (final Exception e) {

@ -36,10 +36,10 @@ import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.SitemapImporter;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
import de.anomic.document.parser.html.ContentScraper;
@ -225,7 +225,7 @@ public class WatchCrawler_p {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
final String reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash,
url,
null,
@ -279,7 +279,7 @@ public class WatchCrawler_p {
prop.putHTML("info_reasonString", reasonString);
final ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(
new CrawlEntry(
new Request(
sb.peers.mySeed().hash,
crawlingStartURL,
"",
@ -364,7 +364,7 @@ public class WatchCrawler_p {
if (nexturl == null) continue;
// enqueuing the url for crawling
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.crawlStacker.enqueueEntry(new Request(
sb.peers.mySeed().hash,
nexturl,
"",

@ -4,9 +4,9 @@ import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.IndexingStack;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.util.kelondroException;
import de.anomic.plasma.plasmaSwitchboard;
@ -39,11 +39,11 @@ public class queues_p {
yacySeed initiator;
//indexing queue
prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.crawler.indexingStack.getActiveQueueSize());
prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.getActiveQueueSize());
prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30));
prop.putNum("urlpublictextSize", sb.indexSegment.urlMetadata().size());
prop.putNum("rwipublictextSize", sb.indexSegment.termIndex().sizesMax());
if ((sb.crawler.indexingStack.size() == 0) && (sb.crawler.indexingStack.getActiveQueueSize() == 0)) {
if ((sb.crawler.indexingStack.size() == 0) && (sb.getActiveQueueSize() == 0)) {
prop.put("list", "0"); //is empty
} else {
IndexingStack.QueueEntry pcentry;
@ -52,7 +52,6 @@ public class queues_p {
// getting all entries that are currently in process
final ArrayList<IndexingStack.QueueEntry> entryList = new ArrayList<IndexingStack.QueueEntry>();
entryList.addAll(sb.crawler.indexingStack.getActiveQueueEntries());
final int inProcessCount = entryList.size();
// getting all enqueued entries
@ -97,7 +96,7 @@ public class queues_p {
if (sb.crawlQueues.size() == 0) {
prop.put("list-loader", "0");
} else {
final CrawlEntry[] w = sb.crawlQueues.activeWorkerEntries();
final Request[] w = sb.crawlQueues.activeWorkerEntries();
int count = 0;
for (int i = 0; i < w.length; i++) {
if (w[i] == null) continue;
@ -138,10 +137,10 @@ public class queues_p {
}
public static final void addNTable(final plasmaSwitchboard sb, final serverObjects prop, final String tableName, final ArrayList<CrawlEntry> crawlerList) {
public static final void addNTable(final plasmaSwitchboard sb, final serverObjects prop, final String tableName, final ArrayList<Request> crawlerList) {
int showNum = 0;
CrawlEntry urle;
Request urle;
yacySeed initiator;
for (int i = 0; i < crawlerList.size(); i++) {
urle = crawlerList.get(i);

@ -3,7 +3,7 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Set;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;

@ -31,7 +31,7 @@ import java.util.Date;
import java.util.Iterator;
import de.anomic.content.RSSMessage;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.retrieval.Request;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.util.DateFormatter;
@ -77,7 +77,7 @@ public class rct_p {
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.crawlStacker.enqueueEntry(new Request(
peerhash,
url,
(referrer == null) ? null : referrer.hash(),

@ -38,7 +38,7 @@ import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.listManager;
import de.anomic.data.list.ListAccumulator;

@ -27,8 +27,8 @@
import java.io.IOException;
import java.util.Date;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.DateFormatter;
@ -62,7 +62,7 @@ public class urls {
long maxTime = Math.min(20000, Math.max(1000, post.getInt("time", 10000)));
long timeout = System.currentTimeMillis() + maxTime;
int c = 0;
CrawlEntry entry;
Request entry;
yacyURL referrer;
while ((maxCount > 0) &&
(System.currentTimeMillis() < timeout) &&

@ -33,6 +33,7 @@ import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import de.anomic.crawler.retrieval.Request;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.ObjectIndex;
import de.anomic.kelondro.order.CloneableIterator;
@ -68,7 +69,7 @@ public class Balancer {
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
cacheStacksPath.mkdirs();
File f = new File(cacheStacksPath, stackname + indexSuffix);
urlFileIndex = new Table(f, CrawlEntry.rowdef, (fullram) ? Table.tailCacheUsageAuto : Table.tailCacheDenyUsage, EcoFSBufferSize, 0);
urlFileIndex = new Table(f, Request.rowdef, (fullram) ? Table.tailCacheUsageAuto : Table.tailCacheDenyUsage, EcoFSBufferSize, 0);
profileErrors = 0;
lastDomainStackFill = 0;
Log.logInfo("Balancer", "opened balancer file with " + urlFileIndex.size() + " entries from " + f.toString());
@ -108,12 +109,12 @@ public class Balancer {
}
}
public CrawlEntry get(final String urlhash) throws IOException {
public Request get(final String urlhash) throws IOException {
assert urlhash != null;
if (urlFileIndex == null) return null; // case occurs during shutdown
final Row.Entry entry = urlFileIndex.get(urlhash.getBytes());
if (entry == null) return null;
return new CrawlEntry(entry);
return new Request(entry);
}
public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException {
@ -125,11 +126,11 @@ public class Balancer {
final Iterator<Row.Entry> i = urlFileIndex.rows();
final HashSet<String> urlHashes = new HashSet<String>();
Row.Entry rowEntry;
CrawlEntry crawlEntry;
Request crawlEntry;
final long terminate = (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
while (i.hasNext() && (System.currentTimeMillis() < terminate)) {
rowEntry = i.next();
crawlEntry = new CrawlEntry(rowEntry);
crawlEntry = new Request(rowEntry);
if (crawlEntry.profileHandle().equals(profileHandle)) {
urlHashes.add(crawlEntry.url().hash());
}
@ -215,7 +216,7 @@ public class Balancer {
return false;
}
public void push(final CrawlEntry entry) throws IOException {
public void push(final Request entry) throws IOException {
assert entry != null;
String hash = entry.url().hash();
synchronized (this) {
@ -289,7 +290,7 @@ public class Balancer {
* @return a url in a CrawlEntry object
* @throws IOException
*/
public CrawlEntry pop(final boolean delay, final CrawlProfile profile) throws IOException {
public Request pop(final boolean delay, final CrawlProfile profile) throws IOException {
// returns a crawl entry from the stack and ensures minimum delta times
filltop(delay, -600000, false);
@ -304,7 +305,7 @@ public class Balancer {
filltop(delay, 0, true);
long sleeptime = 0;
CrawlEntry crawlEntry = null;
Request crawlEntry = null;
while (this.urlFileIndex.size() > 0) {
// first simply take one of the entries in the top list, that should be one without any delay
String result = nextFromDelayed();
@ -323,7 +324,7 @@ public class Balancer {
}
//assert urlFileIndex.size() + 1 == s : "urlFileIndex.size() = " + urlFileIndex.size() + ", s = " + s + ", result = " + result;
crawlEntry = new CrawlEntry(rowEntry);
crawlEntry = new Request(rowEntry);
//Log.logInfo("Balancer", "fetched next url: " + crawlEntry.url().toNormalform(true, false));
// at this point we must check if the crawlEntry has relevancy because the crawl profile still exists
@ -433,15 +434,15 @@ public class Balancer {
}
}
public ArrayList<CrawlEntry> top(int count) {
public ArrayList<Request> top(int count) {
count = Math.min(count, top.size());
ArrayList<CrawlEntry> cel = new ArrayList<CrawlEntry>();
ArrayList<Request> cel = new ArrayList<Request>();
if (count == 0) return cel;
for (String n: top) {
try {
Row.Entry rowEntry = urlFileIndex.get(n.getBytes());
if (rowEntry == null) continue;
final CrawlEntry crawlEntry = new CrawlEntry(rowEntry);
final Request crawlEntry = new Request(rowEntry);
cel.add(crawlEntry);
count--;
if (count <= 0) break;
@ -451,11 +452,11 @@ public class Balancer {
return cel;
}
public Iterator<CrawlEntry> iterator() throws IOException {
public Iterator<Request> iterator() throws IOException {
return new EntryIterator();
}
private class EntryIterator implements Iterator<CrawlEntry> {
private class EntryIterator implements Iterator<Request> {
private Iterator<Row.Entry> rowIterator;
@ -467,10 +468,10 @@ public class Balancer {
return (rowIterator == null) ? false : rowIterator.hasNext();
}
public CrawlEntry next() {
public Request next() {
final Row.Entry entry = rowIterator.next();
try {
return (entry == null) ? null : new CrawlEntry(entry);
return (entry == null) ? null : new Request(entry);
} catch (final IOException e) {
rowIterator = null;
return null;

@ -37,9 +37,11 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.content.RSSMessage;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.httpClient;
import de.anomic.http.httpDocument;
import de.anomic.kelondro.table.SplitTable;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.FileUtils;
@ -57,7 +59,7 @@ public class CrawlQueues {
protected plasmaSwitchboard sb;
protected Log log;
protected Map<Integer, crawlWorker> workers; // mapping from url hash to Worker thread object
protected ProtocolLoader loader;
protected LoaderDispatcher loader;
private final ArrayList<String> remoteCrawlProviderHashes;
public NoticedURL noticeURL;
@ -67,7 +69,7 @@ public class CrawlQueues {
this.sb = sb;
this.log = new Log("CRAWLER");
this.workers = new ConcurrentHashMap<Integer, crawlWorker>();
this.loader = new ProtocolLoader(sb, log);
this.loader = new LoaderDispatcher(sb, log);
this.remoteCrawlProviderHashes = new ArrayList<String>();
// start crawling management
@ -106,7 +108,7 @@ public class CrawlQueues {
public yacyURL getURL(final String urlhash) {
assert urlhash != null;
if (urlhash == null || urlhash.length() == 0) return null;
final CrawlEntry ne = noticeURL.get(urlhash);
final Request ne = noticeURL.get(urlhash);
if (ne != null) return ne.url();
ZURL.Entry ee = delegatedURL.getEntry(urlhash);
if (ee != null) return ee.url();
@ -164,9 +166,9 @@ public class CrawlQueues {
delegatedURL.close();
}
public CrawlEntry[] activeWorkerEntries() {
public Request[] activeWorkerEntries() {
synchronized (workers) {
final CrawlEntry[] e = new CrawlEntry[workers.size()];
final Request[] e = new Request[workers.size()];
int i = 0;
for (final crawlWorker w: workers.values()) e[i++] = w.entry;
return e;
@ -203,7 +205,7 @@ public class CrawlQueues {
if(isPaused(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) return false;
// do a local crawl
CrawlEntry urlEntry = null;
Request urlEntry = null;
while (urlEntry == null && noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) > 0) {
final String stats = "LOCALCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]";
try {
@ -234,7 +236,7 @@ public class CrawlQueues {
* @param stats String for log prefixing
* @return
*/
private void generateCrawl(CrawlEntry urlEntry, final String stats, final String profileHandle) {
private void generateCrawl(Request urlEntry, final String stats, final String profileHandle) {
final CrawlProfile.entry profile = sb.crawler.profilesActiveCrawls.getEntry(profileHandle);
if (profile != null) {
@ -443,7 +445,7 @@ public class CrawlQueues {
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.crawlStacker.enqueueEntry(new Request(
hash,
url,
(referrer == null) ? null : referrer.hash(),
@ -491,7 +493,7 @@ public class CrawlQueues {
final String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", "
+ noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]";
try {
final CrawlEntry urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true, sb.crawler.profilesActiveCrawls);
final Request urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true, sb.crawler.profilesActiveCrawls);
final String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " +
@ -505,13 +507,13 @@ public class CrawlQueues {
}
}
public httpDocument loadResourceFromWeb(
public Response loadResourceFromWeb(
final yacyURL url,
final boolean forText,
final boolean global
) throws IOException {
final CrawlEntry centry = new CrawlEntry(
final Request centry = new Request(
sb.peers.mySeed().hash,
url,
"",
@ -539,11 +541,11 @@ public class CrawlQueues {
protected final class crawlWorker extends Thread {
protected CrawlEntry entry;
protected Request entry;
private final Integer code;
private long start;
public crawlWorker(final CrawlEntry entry) {
public crawlWorker(final Request entry) {
this.start = System.currentTimeMillis();
this.entry = entry;
this.entry.setStatus("worker-initialized", serverProcessorJob.STATUS_INITIATED);

@ -31,6 +31,7 @@ package de.anomic.crawler;
import java.net.UnknownHostException;
import java.util.Date;
import de.anomic.crawler.retrieval.Request;
import de.anomic.data.Blacklist;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
@ -45,7 +46,7 @@ public final class CrawlStacker {
private Log log = new Log("STACKCRAWL");
private serverProcessor<CrawlEntry> fastQueue, slowQueue;
private serverProcessor<Request> fastQueue, slowQueue;
private long dnsHit, dnsMiss;
private CrawlQueues nextQueue;
private CrawlSwitchboard crawler;
@ -71,8 +72,8 @@ public final class CrawlStacker {
this.acceptLocalURLs = acceptLocalURLs;
this.acceptGlobalURLs = acceptGlobalURLs;
this.fastQueue = new serverProcessor<CrawlEntry>("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
this.slowQueue = new serverProcessor<CrawlEntry>("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5);
this.fastQueue = new serverProcessor<Request>("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
this.slowQueue = new serverProcessor<Request>("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5);
this.log.logInfo("STACKCRAWL thread initialized.");
}
@ -125,7 +126,7 @@ public final class CrawlStacker {
}
*/
public CrawlEntry job(CrawlEntry entry) {
public Request job(Request entry) {
// this is the method that is called by the busy thread from outside
if (entry == null) return null;
@ -145,7 +146,7 @@ public final class CrawlStacker {
return null;
}
public void enqueueEntry(final CrawlEntry entry) {
public void enqueueEntry(final Request entry) {
// DEBUG
if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + entry.initiator() + ", name=" + entry.name() + ", load=" + entry.loaddate() + ", depth=" + entry.depth());
@ -167,7 +168,7 @@ public final class CrawlStacker {
}
}
public String stackCrawl(final CrawlEntry entry) {
public String stackCrawl(final Request entry) {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");

@ -29,10 +29,8 @@ package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.document.Parser;
import de.anomic.http.httpHeader;
@ -60,7 +58,6 @@ public class IndexingStack {
protected final CrawlProfile profiles;
protected final RecordStack sbQueueStack;
protected final yacySeedDB peers;
protected final ConcurrentHashMap<String, QueueEntry> queueInProcess;
public IndexingStack(
final yacySeedDB peers,
@ -69,7 +66,6 @@ public class IndexingStack {
final CrawlProfile profiles) {
this.profiles = profiles;
this.peers = peers;
this.queueInProcess = new ConcurrentHashMap<String, QueueEntry>();
this.sbQueueStack = RecordStack.open(new File(queuesRoot, sbQueueStackName), rowdef);
}
@ -199,27 +195,24 @@ public class IndexingStack {
}
}
public QueueEntry newEntry(final yacyURL url, final String referrer, final Date ifModifiedSince, final boolean requestWithCookie,
final String initiator, final int depth, final String profilehandle, final String anchorName) {
return new QueueEntry(url, referrer, ifModifiedSince, requestWithCookie, initiator, depth, profilehandle, anchorName);
}
public void store(final QueueEntry entry) {
queueInProcess.put(entry.url().hash(), entry);
}
public QueueEntry getActiveEntry(final String urlhash) {
// show one entry from the queue
return this.queueInProcess.get(urlhash);
}
public int getActiveQueueSize() {
return this.queueInProcess.size();
}
public Collection<QueueEntry> getActiveQueueEntries() {
// todo: check dead entries?
return this.queueInProcess.values();
public QueueEntry newEntry(
final yacyURL url,
final String referrer,
final Date ifModifiedSince,
final boolean requestWithCookie,
final String initiator,
final int depth,
final String profilehandle,
final String anchorName) {
return new QueueEntry(
url,
referrer,
ifModifiedSince,
requestWithCookie,
initiator,
depth,
profilehandle,
anchorName);
}
public static final int QUEUE_STATE_FRESH = 0;
@ -229,6 +222,9 @@ public class IndexingStack {
public static final int QUEUE_STATE_INDEXSTORAGE = 4;
public static final int QUEUE_STATE_FINISHED = 5;
/**
* A HarvestResponse is a object that refers to a loaded entity.
*/
public class QueueEntry {
yacyURL url; // plasmaURL.urlStringLength
String referrerHash; // plasmaURL.urlHashLength
@ -245,8 +241,15 @@ public class IndexingStack {
private httpResponseHeader responseHeader;
private yacyURL referrerURL;
public QueueEntry(final yacyURL url, final String referrer, final Date ifModifiedSince, final boolean requestWithCookie,
final String initiator, final int depth, final String profileHandle, final String anchorName) {
public QueueEntry(
final yacyURL url,
final String referrer,
final Date ifModifiedSince,
final boolean requestWithCookie,
final String initiator,
final int depth,
final String profileHandle,
final String anchorName) {
this.url = url;
this.referrerHash = referrer;
this.ifModifiedSince = ifModifiedSince;
@ -310,14 +313,6 @@ public class IndexingStack {
this.status = newStatus;
}
public void close() {
queueInProcess.remove(this.url.hash());
}
protected void finalize() {
this.close();
}
public yacyURL url() {
return url;
}
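
The "superfluous crawl cache" named in the commit message is the queueInProcess map removed above: IndexingStack no longer tracks entries that are currently being indexed, so store(), getActiveEntry(), getActiveQueueSize(), getActiveQueueEntries() and the close()/finalize() bookkeeping disappear. Callers instead ask the switchboard for the number of active indexing jobs; the replacement, copied from the plasmaSwitchboard hunk further down, simply sums the four indexing pipeline queues:

    public int getActiveQueueSize() {
        return this.indexingDocumentProcessor.queueSize() +
               this.indexingCondensementProcessor.queueSize() +
               this.indexingAnalysisProcessor.queueSize() +
               this.indexingStorageProcessor.queueSize();
    }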

@ -23,7 +23,7 @@
package de.anomic.crawler;
import de.anomic.http.httpDocument;
import de.anomic.crawler.retrieval.Response;
import de.anomic.server.serverSemaphore;
import de.anomic.yacy.yacyURL;
@ -41,7 +41,7 @@ public final class LoaderMessage {
public final boolean keepInMemory;
private serverSemaphore resultSync = null;
private httpDocument result;
private Response result;
private String errorMessage;
// loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) {
@ -80,7 +80,7 @@ public final class LoaderMessage {
return this.errorMessage;
}
public void setResult(final httpDocument theResult) {
public void setResult(final Response theResult) {
// store the result
this.result = theResult;
@ -88,8 +88,8 @@ public final class LoaderMessage {
this.resultSync.V();
}
public httpDocument waitForResult() throws InterruptedException {
httpDocument theResult = null;
public Response waitForResult() throws InterruptedException {
Response theResult = null;
this.resultSync.P();
/* =====> CRITICAL SECTION <======== */

@ -8,6 +8,7 @@ import java.util.Iterator;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.retrieval.Request;
public class NoticeURLImporter extends AbstractImporter implements Importer {
@ -129,11 +130,11 @@ public class NoticeURLImporter extends AbstractImporter implements Importer {
}
// getting an iterator and loop through the URL entries
final Iterator<CrawlEntry> entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null;
final Iterator<Request> entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null;
while (true) {
String nextHash = null;
CrawlEntry nextEntry = null;
Request nextEntry = null;
try {
if (stackTypes[stackType] != -1) {

@ -30,6 +30,7 @@ import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import de.anomic.crawler.retrieval.Request;
import de.anomic.yacy.logging.Log;
public class NoticedURL {
@ -134,7 +135,7 @@ public class NoticedURL {
remoteStack.has(urlhash);
}
public void push(final int stackType, final CrawlEntry entry) {
public void push(final int stackType, final Request entry) {
try {
switch (stackType) {
case STACK_TYPE_CORE:
@ -151,8 +152,8 @@ public class NoticedURL {
} catch (final IOException er) {}
}
public CrawlEntry get(final String urlhash) {
CrawlEntry entry = null;
public Request get(final String urlhash) {
Request entry = null;
try {if ((entry = coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
try {if ((entry = limitStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
try {if ((entry = remoteStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
@ -182,7 +183,7 @@ public class NoticedURL {
return removed;
}
public ArrayList<CrawlEntry> top(final int stackType, final int count) {
public ArrayList<Request> top(final int stackType, final int count) {
switch (stackType) {
case STACK_TYPE_CORE: return top(coreStack, count);
case STACK_TYPE_LIMIT: return top(limitStack, count);
@ -191,7 +192,7 @@ public class NoticedURL {
}
}
public CrawlEntry pop(final int stackType, final boolean delay, CrawlProfile profile) throws IOException {
public Request pop(final int stackType, final boolean delay, CrawlProfile profile) throws IOException {
switch (stackType) {
case STACK_TYPE_CORE: return pop(coreStack, delay, profile);
case STACK_TYPE_LIMIT: return pop(limitStack, delay, profile);
@ -202,7 +203,7 @@ public class NoticedURL {
public void shift(final int fromStack, final int toStack, CrawlProfile profile) {
try {
final CrawlEntry entry = pop(fromStack, false, profile);
final Request entry = pop(fromStack, false, profile);
if (entry != null) push(toStack, entry);
} catch (final IOException e) {
return;
@ -219,10 +220,10 @@ public class NoticedURL {
}
}
private CrawlEntry pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException {
private Request pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException {
// this is a filo - pop
int s;
CrawlEntry entry;
Request entry;
int errors = 0;
synchronized (balancer) {
while ((s = balancer.size()) > 0) {
@ -241,15 +242,15 @@ public class NoticedURL {
return null;
}
private ArrayList<CrawlEntry> top(final Balancer balancer, int count) {
private ArrayList<Request> top(final Balancer balancer, int count) {
// this is a filo - top
if (count > balancer.size()) count = balancer.size();
ArrayList<CrawlEntry> list;
ArrayList<Request> list;
list = balancer.top(count);
return list;
}
public Iterator<CrawlEntry> iterator(final int stackType) {
public Iterator<Request> iterator(final int stackType) {
// returns an iterator of plasmaCrawlBalancerEntry Objects
try {switch (stackType) {
case STACK_TYPE_CORE: return coreStack.iterator();
@ -257,7 +258,7 @@ public class NoticedURL {
case STACK_TYPE_REMOTE: return remoteStack.iterator();
default: return null;
}} catch (final IOException e) {
return new HashSet<CrawlEntry>().iterator();
return new HashSet<Request>().iterator();
}
}

@ -40,6 +40,7 @@ import java.util.LinkedList;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;
import de.anomic.http.httpResponse;

@ -32,6 +32,7 @@ import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import de.anomic.crawler.retrieval.Request;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.RowSet;
import de.anomic.kelondro.index.ObjectIndex;
@ -53,7 +54,7 @@ public class ZURL {
"Cardinal workdate-8 {b256}, " + // the time when the url was last time tried to load
"Cardinal workcount-4 {b256}, " + // number of load retries
"String anycause-132, " + // string describing load failure
"byte[] entry-" + CrawlEntry.rowdef.objectsize, // extra space
"byte[] entry-" + Request.rowdef.objectsize, // extra space
Base64Order.enhancedCoder
);
@ -96,7 +97,7 @@ public class ZURL {
}
public synchronized Entry newEntry(
final CrawlEntry bentry,
final Request bentry,
final String executor,
final Date workdate,
final int workcount,
@ -160,7 +161,7 @@ public class ZURL {
public class Entry {
CrawlEntry bentry; // the balancer entry
Request bentry; // the balancer entry
private final String executor; // the crawling executor
private final Date workdate; // the time when the url was last time tried to load
private final int workcount; // number of tryings
@ -168,7 +169,7 @@ public class ZURL {
private boolean stored;
public Entry(
final CrawlEntry bentry,
final Request bentry,
final String executor,
final Date workdate,
final int workcount,
@ -190,7 +191,7 @@ public class ZURL {
this.workdate = new Date(entry.getColLong(2));
this.workcount = (int) entry.getColLong(3);
this.anycause = entry.getColString(4, "UTF-8");
this.bentry = new CrawlEntry(CrawlEntry.rowdef.newEntry(entry.getColBytes(5)));
this.bentry = new Request(Request.rowdef.newEntry(entry.getColBytes(5)));
assert ((new String(entry.getColBytes(0))).equals(bentry.url().hash()));
this.stored = true;
return;

@ -25,18 +25,18 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
package de.anomic.crawler.retrieval;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import de.anomic.crawler.Latency;
import de.anomic.document.Parser;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpDocument;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.net.ftpc;
import de.anomic.plasma.plasmaHTCache;
@ -56,14 +56,14 @@ public class FTPLoader {
maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l);
}
protected httpDocument createCacheEntry(final CrawlEntry entry, final String mimeType, final Date fileDate) {
protected Response createCacheEntry(final Request entry, final String mimeType, final Date fileDate) {
if (entry == null) return null;
httpRequestHeader requestHeader = new httpRequestHeader();
if (entry.referrerhash() != null) requestHeader.put(httpRequestHeader.REFERER, sb.getURL(entry.referrerhash()).toNormalform(true, false));
httpResponseHeader responseHeader = new httpResponseHeader();
responseHeader.put(httpHeader.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate));
responseHeader.put(httpHeader.CONTENT_TYPE, mimeType);
httpDocument metadata = new httpDocument(
Response metadata = new Response(
entry.depth(), entry.url(), entry.name(), "OK",
requestHeader, responseHeader,
entry.initiator(), sb.crawler.profilesActiveCrawls.getEntry(entry.profileHandle()));
@ -77,14 +77,14 @@ public class FTPLoader {
* @param entry
* @return
*/
public httpDocument load(final CrawlEntry entry) throws IOException {
public Response load(final Request entry) throws IOException {
long start = System.currentTimeMillis();
final yacyURL entryUrl = entry.url();
final String fullPath = getPath(entryUrl);
// the return value
httpDocument htCache = null;
Response htCache = null;
// determine filename and path
String file, path;
@ -215,7 +215,7 @@ public class FTPLoader {
* @return
* @throws Exception
*/
private httpDocument getFile(final ftpc ftpClient, final CrawlEntry entry) throws Exception {
private Response getFile(final ftpc ftpClient, final Request entry) throws Exception {
// determine the mimetype of the resource
final yacyURL entryUrl = entry.url();
final String mimeType = Parser.mimeOf(entryUrl);
@ -223,7 +223,7 @@ public class FTPLoader {
// if the mimetype and file extension is supported we start to download
// the file
httpDocument htCache = null;
Response htCache = null;
String supportError = Parser.supports(entryUrl, mimeType);
if (supportError != null) {
// reject file
@ -271,7 +271,7 @@ public class FTPLoader {
* @param cacheFile
* @return
*/
private byte[] generateDirlist(final ftpc ftpClient, final CrawlEntry entry, final String path) {
private byte[] generateDirlist(final ftpc ftpClient, final Request entry, final String path) {
// getting the dirlist
final yacyURL entryUrl = entry.url();

@ -23,11 +23,12 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
package de.anomic.crawler.retrieval;
import java.io.IOException;
import java.util.Date;
import de.anomic.crawler.Latency;
import de.anomic.data.Blacklist;
import de.anomic.document.Parser;
import de.anomic.http.httpClient;
@ -35,7 +36,6 @@ import de.anomic.http.httpHeader;
import de.anomic.http.httpResponse;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpDocument;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.yacy.yacyURL;
@ -83,8 +83,8 @@ public final class HTTPLoader {
* @param responseStatus Status-Code SPACE Reason-Phrase
* @return
*/
protected httpDocument createCacheEntry(final CrawlEntry entry, final Date requestDate, final httpRequestHeader requestHeader, final httpResponseHeader responseHeader, final String responseStatus) {
httpDocument metadata = new httpDocument(
protected Response createCacheEntry(final Request entry, final Date requestDate, final httpRequestHeader requestHeader, final httpResponseHeader responseHeader, final String responseStatus) {
Response metadata = new Response(
entry.depth(),
entry.url(),
entry.name(),
@ -98,14 +98,14 @@ public final class HTTPLoader {
return metadata;
}
public httpDocument load(final CrawlEntry entry) throws IOException {
public Response load(final Request entry) throws IOException {
long start = System.currentTimeMillis();
httpDocument doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT);
Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT);
Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
return doc;
}
private httpDocument load(final CrawlEntry entry, final int retryCount) throws IOException {
private Response load(final Request entry, final int retryCount) throws IOException {
if (retryCount < 0) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "redirection counter exceeded").store();
@ -134,7 +134,7 @@ public final class HTTPLoader {
}
// take a file from the net
httpDocument htCache = null;
Response htCache = null;
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE);
//try {
// create a request header

@ -1,4 +1,4 @@
// plasmaProtocolLoader.java
// LoaderDispatcher.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 24.10.2007 on http://yacy.net
//
@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
package de.anomic.crawler.retrieval;
import java.io.IOException;
import java.util.Arrays;
@ -33,13 +33,12 @@ import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.http.httpDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverProcessorJob;
import de.anomic.yacy.logging.Log;
public final class ProtocolLoader {
public final class LoaderDispatcher {
private static final long minDelay = 250; // milliseconds; 4 accesses per second
private static final ConcurrentHashMap<String, Long> accessTime = new ConcurrentHashMap<String, Long>(); // to protect targets from DDoS
@ -50,7 +49,7 @@ public final class ProtocolLoader {
private final HTTPLoader httpLoader;
private final FTPLoader ftpLoader;
public ProtocolLoader(final plasmaSwitchboard sb, final Log log) {
public LoaderDispatcher(final plasmaSwitchboard sb, final Log log) {
this.sb = sb;
this.log = log;
this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http","https","ftp"}));
@ -70,7 +69,7 @@ public final class ProtocolLoader {
return (HashSet<String>) this.supportedProtocols.clone();
}
public httpDocument load(final CrawlEntry entry) throws IOException {
public Response load(final Request entry) throws IOException {
// getting the protocol of the next URL
final String protocol = entry.url().getProtocol();
final String host = entry.url().getHost();
@ -111,10 +110,10 @@ public final class ProtocolLoader {
}
}
public String process(final CrawlEntry entry) {
public String process(final Request entry) {
// load a resource, store it to htcache and push queue entry to switchboard queue
// returns null if everything went fine, a fail reason string if a problem occurred
httpDocument h;
Response h;
try {
entry.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
h = load(entry);
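
A minimal usage sketch of the renamed dispatcher, wired up the same way CrawlQueues does above (sb and log are the existing plasmaSwitchboard and Log instances, request is a Request taken from a crawl stack; all three are assumed for illustration, so this is not a standalone program):

    import java.io.IOException;

    import de.anomic.crawler.retrieval.LoaderDispatcher; // was de.anomic.crawler.ProtocolLoader
    import de.anomic.crawler.retrieval.Request;
    import de.anomic.crawler.retrieval.Response;

    final LoaderDispatcher loader = new LoaderDispatcher(sb, log);
    try {
        // load() dispatches to the http(s) or ftp loader based on the URL's protocol
        final Response response = loader.load(request);
    } catch (final IOException e) {
        // load failures surface as IOExceptions; process(request) instead returns a
        // failure reason string, or null if loading and enqueueing succeeded
    }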

@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
package de.anomic.crawler.retrieval;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
@ -38,7 +38,7 @@ import de.anomic.server.serverProcessorJob;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
public class CrawlEntry extends serverProcessorJob {
public class Request extends serverProcessorJob {
// row definition for balancer-related NURL-entries
public final static Row rowdef = new Row(
@ -80,6 +80,9 @@ public class CrawlEntry extends serverProcessorJob {
/**
* A HarvestRequest Entry is a object that is created to provide
* all information to load a specific resource.
*
* @param initiator the hash of the initiator peer
* @param url the {@link URL} to crawl
* @param referrer the hash of the referrer URL
@ -90,7 +93,7 @@ public class CrawlEntry extends serverProcessorJob {
* @param anchors number of anchors of the parent
* @param forkfactor sum of anchors of all ancestors
*/
public CrawlEntry(
public Request(
final String initiator,
final yacyURL url,
final String referrerhash,
@ -126,7 +129,7 @@ public class CrawlEntry extends serverProcessorJob {
this.status = serverProcessorJob.STATUS_INITIATED;
}
public CrawlEntry(final Row.Entry entry) throws IOException {
public Request(final Row.Entry entry) throws IOException {
assert (entry != null);
insertEntry(entry);
}

@ -24,16 +24,19 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.http;
package de.anomic.crawler.retrieval;
import java.util.Date;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.yacy.yacyURL;
public class httpDocument {
public class Response {
// doctypes:
public static final char DT_PDFPS = 'p';
@ -51,7 +54,7 @@ public class httpDocument {
// the class objects
private final int depth; // the depth of pre-fetching
private final String responseStatus;
private byte[] cacheArray; // or the cache as byte-array
private byte[] cacheArray; //
private final yacyURL url;
private final String name; // the name of the link, read as anchor from an <a>-tag
private final CrawlProfile.entry profile;
@ -130,7 +133,7 @@ public class httpDocument {
return doctype;
}
public httpDocument(
public Response(
final int depth,
final yacyURL url,
final String name,

@ -38,9 +38,9 @@ import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;
import de.anomic.http.httpResponse;
@ -272,7 +272,7 @@ public class SitemapParser extends DefaultHandler {
}
// URL needs to crawled
this.sb.crawlStacker.enqueueEntry(new CrawlEntry(
this.sb.crawlStacker.enqueueEntry(new Request(
this.sb.peers.mySeed().hash,
url,
null, // this.siteMapURL.toString(),

@ -61,8 +61,8 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Request;
import de.anomic.document.Word;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.TransformerWriter;
@ -262,7 +262,7 @@ public class bookmarksDB {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.crawlStacker.enqueueEntry(new Request(
sb.peers.mySeed().hash,
crawlingStartURL,
null,

@ -44,7 +44,7 @@ import java.util.Properties;
import javax.swing.event.EventListenerList;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.parser.htmlParser;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;

@ -44,7 +44,7 @@ import com.catcode.odf.ODFMetaFileAnalyzer;
import com.catcode.odf.OpenDocumentMetadata;
import com.catcode.odf.OpenDocumentTextInputStream;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;

@ -37,7 +37,7 @@ import java.util.Set;
import com.jguild.jrpm.io.RPMFile;
import com.jguild.jrpm.io.datatype.DataTypeIf;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;

@ -38,7 +38,7 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;

@ -71,7 +71,8 @@ import java.util.logging.LogManager;
import java.util.logging.Logger;
import java.util.zip.GZIPOutputStream;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
import de.anomic.document.Parser;
import de.anomic.document.parser.html.ContentTransformer;
@ -377,7 +378,7 @@ public final class httpdProxyHandler {
if (theLogger.isFinest()) theLogger.logFinest(reqID + " page not in cache: fulfill request from web");
fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond);
} else {
final httpDocument cacheEntry = new httpDocument(
final Response cacheEntry = new Response(
0, // crawling depth
url, // url
"", // name of the url is unknown
@ -491,7 +492,7 @@ public final class httpdProxyHandler {
}
// reserver cache entry
final httpDocument cacheEntry = new httpDocument(
final Response cacheEntry = new Response(
0,
url,
"",

@ -28,7 +28,7 @@ package de.anomic.kelondro.text;
import java.util.Date;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.retrieval.Request;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.Reference;
@ -82,7 +82,7 @@ public interface Metadata {
public String toString(final String snippet);
public CrawlEntry toBalancerEntry(final String initiatorHash);
public Request toBalancerEntry(final String initiatorHash);
public String toString();

@ -34,12 +34,12 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
import de.anomic.document.Condenser;
import de.anomic.document.Word;
import de.anomic.document.Document;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.http.httpDocument;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
@ -269,7 +269,7 @@ public final class Segment {
new byte[0], // md5
(int) sourcesize, // size
condenser.RESULT_NUMB_WORDS, // word count
httpDocument.docType(document.dc_format()), // doctype
Response.docType(document.dc_format()), // doctype
condenser.RESULT_FLAGS, // flags
language, // language
document.inboundLinks(), // inbound links
@ -292,7 +292,7 @@ public final class Segment {
document, // document content
condenser, // document condenser
language, // document language
httpDocument.docType(document.dc_format()), // document type
Response.docType(document.dc_format()), // document type
document.inboundLinks(), // inbound links
document.outboundLinks() // outbound links
);

@ -33,7 +33,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.Properties;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.retrieval.Request;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield;
@ -468,8 +468,8 @@ public class URLMetadataRow implements Metadata {
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
}
public CrawlEntry toBalancerEntry(final String initiatorHash) {
return new CrawlEntry(
public Request toBalancerEntry(final String initiatorHash) {
return new Request(
initiatorHash,
metadata().url(),
referrerHash(),

@ -41,9 +41,9 @@ import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import de.anomic.crawler.retrieval.Response;
import de.anomic.document.Classification;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpDocument;
import de.anomic.kelondro.blob.ArrayStack;
import de.anomic.kelondro.blob.Compressor;
import de.anomic.kelondro.blob.Heap;
@ -204,7 +204,7 @@ public final class plasmaHTCache {
public static void storeMetadata(
final httpResponseHeader responseHeader,
httpDocument metadata
Response metadata
) {
if (responseHeader != null) try {
// store the response header into the header database

@ -115,16 +115,13 @@ import java.util.regex.Pattern;
import de.anomic.content.DCEntry;
import de.anomic.content.RSSMessage;
import de.anomic.content.file.SurrogateReader;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.ImporterManager;
import de.anomic.crawler.IndexingStack;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.ProtocolLoader;
import de.anomic.crawler.ResourceObserver;
import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs;
@ -132,6 +129,10 @@ import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.IndexingStack.QueueEntry;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
import de.anomic.data.URLLicense;
import de.anomic.data.blogBoard;
@ -155,7 +156,6 @@ import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpd;
import de.anomic.http.httpDocument;
import de.anomic.http.httpdRobotsTxtConfig;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.order.NaturalOrder;
@ -669,7 +669,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
log.logConfig("Finished Switchboard Initialization");
}
public int getActiveQueueSize() {
return
this.indexingDocumentProcessor.queueSize() +
this.indexingCondensementProcessor.queueSize() +
this.indexingAnalysisProcessor.queueSize() +
this.indexingStorageProcessor.queueSize();
}
public void overwriteNetworkDefinition() {
// load network configuration into settings
@ -1062,7 +1069,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
* {@link CrawlProfile Crawl Profiles} are saved independantly from the queues themselves
* and therefore have to be cleaned up from time to time. This method only performs the clean-up
* if - and only if - the {@link IndexingStack switchboard},
* {@link ProtocolLoader loader} and {@link plasmaCrawlNURL local crawl} queues are all empty.
* {@link LoaderDispatcher loader} and {@link plasmaCrawlNURL local crawl} queues are all empty.
* <p>
* Then it iterates through all existing {@link CrawlProfile crawl profiles} and removes
* all profiles which are not hardcoded.
@ -1088,7 +1095,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
return this.crawler.cleanProfiles();
}
public boolean htEntryStoreProcess(final httpDocument entry) {
public boolean htEntryStoreProcess(final Response entry) {
if (entry == null) return false;
@ -1361,7 +1368,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
return false;
}
if (queueEntry.profile() == null) {
queueEntry.close();
if (this.log.isFine()) log.logFine("deQueue: profile is null");
return false;
}
@ -1390,7 +1396,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// put document into the concurrent processing queue
if (log.isFinest()) log.logFinest("deQueue: passing entry to indexing queue");
this.crawler.indexingStack.store(queueEntry);
this.indexingDocumentProcessor.enQueue(new indexingQueueEntry(queueEntry, null, null));
return true;
} catch (final InterruptedException e) {
@ -1667,7 +1672,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
document = null;
}
if (document == null) {
in.queueEntry.close();
return null;
}
return new indexingQueueEntry(in.queueEntry, document, null);
@ -1728,7 +1732,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
String u = nextUrl.toNormalform(true, true);
if (!(u.startsWith("http") || u.startsWith("ftp"))) continue;
// enqueue the hyperlink into the pre-notice-url db
crawlStacker.enqueueEntry(new CrawlEntry(
crawlStacker.enqueueEntry(new Request(
entry.initiator(),
nextUrl,
entry.url().hash(),
@ -1769,7 +1773,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
return new indexingQueueEntry(in.queueEntry, in.document, condenser);
} catch (final UnsupportedEncodingException e) {
in.queueEntry.close();
return null;
}
}
@ -1784,7 +1787,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
in.queueEntry.updateStatus(IndexingStack.QUEUE_STATE_INDEXSTORAGE);
storeDocumentIndex(in.queueEntry, in.document, in.condenser);
in.queueEntry.updateStatus(IndexingStack.QUEUE_STATE_FINISHED);
in.queueEntry.close();
}
private void storeDocumentIndex(final IndexingStack.QueueEntry queueEntry, final Document document, final Condenser condenser) {
@ -2135,7 +2137,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
) {
assert initiator != null;
// create a new errorURL DB entry
final CrawlEntry bentry = new CrawlEntry(
final Request bentry = new Request(
initiator,
url,
referrerHash,

@ -38,6 +38,7 @@ import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.anomic.crawler.retrieval.Response;
import de.anomic.document.Condenser;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
@ -47,7 +48,6 @@ import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.document.parser.html.ImageEntry;
import de.anomic.http.httpClient;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpDocument;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.ScoreCluster;
@ -354,7 +354,7 @@ public class SnippetCache {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
final httpDocument entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, true, reindexing);
final Response entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, true, reindexing);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
@ -466,7 +466,7 @@ public class SnippetCache {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
final httpDocument entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, forText, global);
final Response entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, forText, global);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
@ -905,7 +905,7 @@ public class SnippetCache {
// if the content is not available in cache try to download it from web
// try to download the resource using a crawler
final httpDocument entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, forText, reindexing);
final Response entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, forText, reindexing);
if (entry == null) return null; // not found in web
// read resource body (if it is there)

@ -24,7 +24,7 @@ package de.anomic.tools;
import java.util.ArrayList;
import java.util.Hashtable;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;

@ -58,8 +58,8 @@ import java.util.TreeMap;
import org.apache.commons.httpclient.methods.multipart.ByteArrayPartSource;
import org.apache.commons.httpclient.methods.multipart.Part;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.data.Blacklist;
import de.anomic.document.Word;
import de.anomic.document.parser.xml.RSSFeed;

@ -44,7 +44,7 @@ import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;

@ -39,7 +39,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;
import de.anomic.http.httpResponse;

@ -35,7 +35,7 @@ import java.net.MalformedURLException;
import javax.imageio.ImageIO;
import de.anomic.http.httpDocument;
import de.anomic.crawler.retrieval.Response;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.yacy.yacyURL;
@ -80,7 +80,7 @@ public class ymageOSM {
InputStream tileStream = plasmaHTCache.getResourceContentStream(tileURL);
if (tileStream == null) {
// download resource using the crawler and keep resource in memory if possible
httpDocument entry = null;
Response entry = null;
try {
entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(tileURL, false, false);
} catch (IOException e) {
