- some redesign of the UI menu structure to make room for the new 'Content Integration' main menu, which contains the import servlets for Wikimedia dumps, phpBB3 forums, and OAI-PMH sources

- extended the OAI-PMH test applet and integrated it into the menu. It does not yet import OAI-PMH records, but it demonstrates that this data can be read and parsed
- some redesign of the ZURL storage: refactored access methods, better concurrency, less synchronization
- limited the LURL metadata database table cache to 20 million entries: until now this cache was bounded only by the available RAM, which may have caused memory-leak-like behavior.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6440 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 8a1046feaa
commit a0e891c63d

@ -6,7 +6,7 @@
</head>
<body id="ConfigSkins">
#%env/templates/header.template%#
#%env/templates/submenuIntegration.template%#
#%env/templates/submenuPortalIntegration.template%#
<h2>Integration of a Search Field for Live Search</h2>
<p>
A 'Live-Search' input field that offers search-as-you-type results in a pop-up window can easily be integrated into any web page.

@ -6,7 +6,7 @@
</head>
<body id="IndexCreate">
#%env/templates/header.template%#
#%env/templates/submenuIntegration.template%#
#%env/templates/submenuPortalIntegration.template%#
<h2>Integration in phpBB3</h2>
<p>
It is possible to insert forum pages into the YaCy index using a database import of forum postings.

@ -6,7 +6,7 @@
</head>
<body id="ConfigSkins">
#%env/templates/header.template%#
#%env/templates/submenuIntegration.template%#
#%env/templates/submenuPortalIntegration.template%#
<h2>Integration of a Search Portal</h2>
<p>
If you would like to integrate YaCy as a portal for your web pages, you may want to change the icons and messages on the search page.

@ -6,7 +6,7 @@
</head>
<body id="ConfigSkins">
#%env/templates/header.template%#
#%env/templates/submenuIntegration.template%#
#%env/templates/submenuPortalIntegration.template%#
<h2>Integration of a Search Box</h2>
<p>
We provide information on how to integrate a search box into any web page that

@ -6,7 +6,7 @@
</head>
<body id="IndexCreate">
#%env/templates/header.template%#
#%env/templates/submenuIntegration.template%#
#%env/templates/submenuPortalIntegration.template%#
<h2>Integration in MediaWiki</h2>
<p>
It is possible to insert wiki pages into the YaCy index using a web crawl on those pages.

@ -6,7 +6,7 @@
</head>
<body id="ContentIntegrationPHPBB3">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
#%env/templates/submenuContentIntegration.template%#
<h2>Content Integration: Retrieval from phpBB3 Databases</h2>
<p>
It is possible to extract texts directly from MySQL and PostgreSQL databases.

@ -69,11 +69,9 @@ public class IndexCreateParserErrors_p {
dark = true;
DigestURI url;
String initiatorHash, executorHash;
ZURL.Entry entry;
yacySeed initiatorSeed, executorSeed;
int j=0;
for (int i = sb.crawlQueues.errorURL.stackSize() - 1; i >= (sb.crawlQueues.errorURL.stackSize() - showRejectedCount); i--) {
entry = sb.crawlQueues.errorURL.top(i);
for (ZURL.Entry entry: sb.crawlQueues.errorURL) {
if (entry == null) continue;
url = entry.url();
if (url == null) continue;

@ -7,7 +7,7 @@
</head>
<body id="IndexImportOAIPMH">
#%env/templates/header.template%#
#%env/templates/submenuIntegration.template%#
#%env/templates/submenuContentIntegration.template%#
<h2>OAI-PMH Import</h2>
#(import)#
@ -15,8 +15,8 @@
<form action="IndexImportOAIPMH_p.html" method="get">
<fieldset>
<legend>OAI-PMH Import: set an OAI-PMH URL</legend>
<input name="oaipmhurl" type="text" value="" size="80" />
<input name="submit" type="submit" value="Import from a OAI-PMH source" />
<input name="oaipmhurl" type="text" value="http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc" size="100" />
<input name="submit" type="submit" value="Import OAI-PMH source" />
</fieldset>
</form>
::

@ -22,7 +22,6 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.net.MalformedURLException;
import net.yacy.document.importer.OAIPMHImporter;
@ -55,16 +54,16 @@ public class IndexImportOAIPMH_p {
if (post == null) {
prop.put("import_status", 0);
} else {
if (post.containsKey("file")) {
if (post.containsKey("oaipmhurl")) {
String oaipmhurl = post.get("oaipmhurl");
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url);
OAIPMHImporter.job.start();
prop.put("import", 1);
prop.put("import", 0);
prop.put("import_thread", "started");
prop.put("import_dump", OAIPMHImporter.job.source());
prop.put("import_source", OAIPMHImporter.job.source());
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);

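In short, the reworked servlet keys on the oaipmhurl form field instead of a file upload, builds a DigestURI from it, and hands that to the importer thread. A condensed sketch of the new flow, using only the classes that appear in the hunk above (sb and post come from the servlet context; error handling omitted):

    String oaipmhurl = post.get("oaipmhurl");
    DigestURI url = new DigestURI(oaipmhurl, null);          // may throw MalformedURLException
    OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url); // the importer extends Thread
    OAIPMHImporter.job.start();                              // harvest runs asynchronously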
@ -7,7 +7,7 @@
</head>
<body id="IndexImportWikimedia">
#%env/templates/header.template%#
#%env/templates/submenuIntegration.template%#
#%env/templates/submenuContentIntegration.template%#
<h2>Wikimedia Dump Import</h2>
#(import)#

@ -43,7 +43,6 @@ import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.SitemapImporter;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
@ -300,7 +299,7 @@ public class WatchCrawler_p {
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
prop.putHTML("info_reasonString", reasonString);
final ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(
sb.crawlQueues.errorURL.push(
new Request(
sb.peers.mySeed().hash,
crawlingStartURL,
@ -316,9 +315,6 @@ public class WatchCrawler_p {
new Date(),
1,
reasonString);
ee.store();
sb.crawlQueues.errorURL.push(ee);
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url

@ -58,10 +58,11 @@
</ul>
</li>
<li class="menugroup" id="menugroupCrawlerControl">
<h3>Web&nbsp;Indexing</h3>
<h3>Index&nbsp;Control</h3>
<ul class="menu">
<li><a href="/WatchCrawler_p.html?autoforward=" class="MenuItemLink lock">Crawl Start &amp; Monitoring</a></li>
<li><a href="/WatchCrawler_p.html?autoforward=" class="MenuItemLink lock">Web Crawler</a></li>
<li><a href="/CrawlResults.html?process=5&amp;autoforward=" class="MenuItemLink">Crawl Results</a></li>
<li><a href="/ContentIntegrationPHPBB3_p.html" class="MenuItemLink lock">External Content</a></li>
<li><a href="/IndexControlRWIs_p.html" class="MenuItemLink lock">Index Administration</a></li>
<li><a href="/Blacklist_p.html" class="MenuItemLink lock">Filter &amp; Blacklists</a></li>
<li><a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink lock">Indexing with Proxy</a></li>

@ -0,0 +1,8 @@
<div class="SubMenu">
<h3>External Content Integration</h3>
<ul class="SubMenu">
<li><a href="/ContentIntegrationPHPBB3_p.html" class="MenuItemLink lock">Import phpBB3 forum</a></li>
<li><a href="/IndexImportWikimedia_p.html" class="MenuItemLink lock">Import Wikimedia dumps</a></li>
<li><a href="/IndexImportOAIPMH_p.html" class="MenuItemLink lock">Import OAI-PMH Sources</a></li>
</ul>
</div>

@ -1,39 +1,32 @@
<div class="SubMenu">
<h3>Content Integration</h3>
<h3>Web Crawler</h3>
</div>
<div class="SubMenu">
<div class="SubMenugroup">
<h3>Web Crawler</h3>
<h3>Crawler Steering</h3>
<ul class="SubMenu">
<li><a href="/CrawlStart_p.html" class="MenuItemLink lock">Crawl Start</a></li>
<li><a href="/CrawlProfileEditor_p.html" class="MenuItemLink lock">Crawl Profile Editor</a></li>
<li><a href="/WatchCrawler_p.html" class="MenuItemLink lock">Crawler Queues</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>URLs to be processed</h3>
<ul class="SubMenu">
<li><a href="/IndexCreateWWWLocalQueue_p.html" class="MenuItemLink lock">Local</a></li>
<li><a href="/IndexCreateWWWGlobalQueue_p.html" class="MenuItemLink lock">Global</a></li>
<li><a href="/IndexCreateWWWRemoteQueue_p.html" class="MenuItemLink lock">Remote</a></li>
<!--<li><a href="/IndexCreateWWWOverhangQueue_p.html" class="MenuItemLink"><em class="lock">Overhang</em></a></li>-->
</ul>
</div>
<div class="SubMenugroup">
<h3>Processing Queues</h3>
<h3>Processing Monitor</h3>
<ul class="SubMenu">
<li><a href="/WatchCrawler_p.html" class="MenuItemLink lock">Crawler Queues</a></li>
<li><a href="/IndexCreateLoaderQueue_p.html" class="MenuItemLink lock">Loader</a></li>
<li><a href="/IndexCreateParserErrors_p.html" class="MenuItemLink lock">Parser Errors</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Database Import</h3>
<h3>Queues</h3>
<ul class="SubMenu">
<li><a href="/ContentIntegrationPHPBB3_p.html" class="MenuItemLink lock">phpBB3 forum</a></li>
<li><a href="/IndexCreateWWWLocalQueue_p.html" class="MenuItemLink lock">Local</a></li>
<li><a href="/IndexCreateWWWGlobalQueue_p.html" class="MenuItemLink lock">Global</a></li>
<li><a href="/IndexCreateWWWRemoteQueue_p.html" class="MenuItemLink lock">Remote</a></li>
<!--<li><a href="/IndexCreateWWWOverhangQueue_p.html" class="MenuItemLink"><em class="lock">Overhang</em></a></li>-->
</ul>
</div>

@ -6,6 +6,5 @@
<li><a href="/ConfigSearchBox.html" class="MenuItemLink">Search Box Anywhere</a></li>
<li><a href="/ConfigWikiSearch.html" class="MenuItemLink">Search Integration for Wikis</a></li>
<li><a href="/ConfigPHPBB3Search.html" class="MenuItemLink">Search Integration for phpBB3</a></li>
<li><a href="/IndexImportWikimedia_p.html" class="MenuItemLink lock">Wikimedia Dump Import</a></li>
</ul>
</div>

@ -32,7 +32,6 @@ import java.io.IOException;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments;
@ -154,14 +153,12 @@ public final class crawlReceipt {
}
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
final ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(
sb.crawlQueues.errorURL.push(
entry.toBalancerEntry(iam),
youare,
null,
0,
result + ":" + reason);
ee.store();
sb.crawlQueues.errorURL.push(ee);
//switchboard.noticeURL.remove(receivedUrlhash);
prop.put("delay", "3600");
return prop;

@ -81,13 +81,11 @@ public class urls {
// place url to notice-url db
sb.crawlQueues.delegatedURL.push(
sb.crawlQueues.delegatedURL.newEntry(
entry,
sb.peers.mySeed().hash,
new Date(),
0,
"client=____________")
);
"client=____________");
// create RSS entry
prop.put("item_" + c + "_title", "");

@ -545,14 +545,12 @@ public class CrawlQueues {
this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED);
if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) && sb.robots.isDisallowed(request.url())) {
if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
final ZURL.Entry eentry = errorURL.newEntry(
errorURL.push(
this.request,
sb.peers.mySeed().hash,
new Date(),
1,
"denied by robots.txt");
eentry.store();
errorURL.push(eentry);
this.request.setStatus("worker-disallowed", WorkflowJob.STATUS_FINISHED);
} else {
// starting a load from the internet
@ -581,28 +579,24 @@ public class CrawlQueues {
}
if (result != null) {
final ZURL.Entry eentry = errorURL.newEntry(
errorURL.push(
this.request,
sb.peers.mySeed().hash,
new Date(),
1,
"cannot load: " + result);
eentry.store();
errorURL.push(eentry);
this.request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
} else {
this.request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);
}
}
} catch (final Exception e) {
final ZURL.Entry eentry = errorURL.newEntry(
errorURL.push(
this.request,
sb.peers.mySeed().hash,
new Date(),
1,
e.getMessage() + " - in worker");
eentry.store();
errorURL.push(eentry);
e.printStackTrace();
Client.initConnectionManager();
this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED);

@ -136,9 +136,7 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null) {
final ZURL.Entry ee = nextQueue.errorURL.newEntry(entry, peers.mySeed().hash, new Date(), 1, rejectReason);
ee.store();
nextQueue.errorURL.push(ee);
nextQueue.errorURL.push(entry, peers.mySeed().hash, new Date(), 1, rejectReason);
}
} catch (final Exception e) {
CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);

@ -30,7 +30,7 @@ import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.concurrent.ConcurrentLinkedQueue;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
@ -44,10 +44,10 @@ import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.retrieval.Request;
public class ZURL {
public class ZURL implements Iterable<ZURL.Entry> {
private static final int EcoFSBufferSize = 200;
private static final int maxStackSize = 300;
private static final int EcoFSBufferSize = 2000;
private static final int maxStackSize = 1000;
public final static Row rowdef = new Row(
"String urlhash-" + Word.commonHashLength + ", " + // the url's hash
@ -60,8 +60,8 @@ public class ZURL {
);
// the class object
protected final ObjectIndex urlIndex;
private final LinkedList<String> stack;
private final ObjectIndex urlIndex;
private final ConcurrentLinkedQueue<String> stack;
public ZURL(
final File cachePath,
@ -79,13 +79,13 @@ public class ZURL {
}
this.urlIndex = new Table(f, rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727);
//urlIndex = new kelondroFlexTable(cachePath, tablename, -1, rowdef, 0, true);
this.stack = new LinkedList<String>();
this.stack = new ConcurrentLinkedQueue<String>();
}
public ZURL() {
// creates a new ZURL in RAM
this.urlIndex = new RowSet(rowdef, 0);
this.stack = new LinkedList<String>();
this.stack = new ConcurrentLinkedQueue<String>();
}
public int size() {
@ -102,18 +102,6 @@ public class ZURL {
if (urlIndex != null) urlIndex.close();
}
public synchronized Entry newEntry(
final Request bentry,
final String executor,
final Date workdate,
final int workcount,
String anycause) {
assert executor != null;
assert executor.length() > 0;
if (anycause == null) anycause = "unknown";
return new Entry(bentry, executor, workdate, workcount, anycause);
}
public boolean remove(final String hash) {
if (hash == null) return false;
//System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " remove " + hash);
@ -125,22 +113,45 @@ public class ZURL {
}
}
public synchronized void push(final Entry e) {
stack.add(e.hash());
while (stack.size() > maxStackSize) stack.removeFirst();
public void push(
final Request bentry,
final String executor,
final Date workdate,
final int workcount,
String anycause) {
assert executor != null;
assert executor.length() > 0;
if (anycause == null) anycause = "unknown";
Entry entry = new Entry(bentry, executor, workdate, workcount, anycause);
entry.store();
stack.add(entry.hash());
while (stack.size() > maxStackSize) stack.poll();
}
public Entry top(final int pos) {
String urlhash;
synchronized (stack) {
if (pos >= stack.size()) return null;
urlhash = stack.get(pos);
public Iterator<ZURL.Entry> iterator() {
return new EntryIterator();
}
private class EntryIterator implements Iterator<ZURL.Entry> {
private Iterator<String> hi;
public EntryIterator() {
this.hi = stack.iterator();
}
public boolean hasNext() {
return hi.hasNext();
}
if (urlhash == null) return null;
return getEntry(urlhash);
public ZURL.Entry next() {
return getEntry(hi.next());
}
public void remove() {
hi.remove();
}
}
public synchronized Entry getEntry(final String urlhash) {
public ZURL.Entry getEntry(final String urlhash) {
try {
if (urlIndex == null) return null;
//System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash);
@ -174,7 +185,7 @@ public class ZURL {
private final String anycause; // string describing reason for load fail
private boolean stored;
public Entry(
private Entry(
final Request bentry,
final String executor,
final Date workdate,
@ -191,7 +202,7 @@ public class ZURL {
stored = false;
}
public Entry(final Row.Entry entry) throws IOException {
private Entry(final Row.Entry entry) throws IOException {
assert (entry != null);
this.executor = entry.getColString(1, "UTF-8");
this.workdate = new Date(entry.getColLong(2));
@ -203,7 +214,7 @@ public class ZURL {
return;
}
public void store() {
protected void store() {
// stores the values from the object variables into the database
if (this.stored) return;
if (this.bentry == null) return;
@ -289,5 +300,6 @@ public class ZURL {
// enumerates entry elements
return new kiter(up, firstHash);
}
}

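Taken together, the ZURL changes fold the old newEntry()/store()/push(entry) sequence into a single push(...) call that creates, stores, and enqueues an entry in one step, and they replace indexed top(i) access with the new Iterable<ZURL.Entry> interface. A minimal sketch of the new calling pattern, assuming the surrounding YaCy objects (sb, request) from the hunks above:

    // record an error: one call now creates, stores, and enqueues the entry
    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash, new Date(), 1, "denied by robots.txt");

    // read entries back by iteration (replaces errorURL.top(i) index access)
    for (ZURL.Entry entry : sb.crawlQueues.errorURL) {
        if (entry == null) continue;
        System.out.println(entry.url());
    }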
@ -144,7 +144,7 @@ public class FTPLoader {
if (berr.size() > 0 || response == null) {
// some error logging
final String detail = (berr.size() > 0) ? "\n Errorlog: " + berr.toString() : "";
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "server download" + detail);
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash, new Date(), 1, "server download" + detail);
throw new IOException("FTPLoader: Unable to download URL " + request.url().toString() + detail);
}
@ -227,7 +227,7 @@ public class FTPLoader {
if (supportError != null) {
// reject file
log.logInfo("PARSER REJECTED URL " + request.url().toString() + ": " + supportError);
sb.crawlQueues.errorURL.newEntry(request, this.sb.peers.mySeed().hash, new Date(), 1, supportError);
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new Exception(supportError);
} else {
// abort the download if content is too long
@ -257,7 +257,7 @@ public class FTPLoader {
b);
} else {
log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + request.url().toString());
sb.crawlQueues.errorURL.newEntry(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new Exception("file size exceeds limit");
}
}

@ -85,7 +85,7 @@ public final class HTTPLoader {
private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount) throws IOException {
if (retryCount < 0) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection counter exceeded").store();
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash, new Date(), 1, "redirection counter exceeded");
throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
}
@ -100,7 +100,7 @@ public final class HTTPLoader {
if (acceptOnlyParseable) {
String supportError = TextParser.supportsExtension(request.url());
if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
}
}
@ -108,7 +108,7 @@ public final class HTTPLoader {
// check if url is in blacklist
final String hostlow = host.toLowerCase();
if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "url in blacklist").store();
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash, new Date(), 1, "url in blacklist");
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
@ -143,7 +143,7 @@ public final class HTTPLoader {
// if the response has not the right file type then reject file
String supportError = TextParser.supports(request.url(), res.getResponseHeader().mime());
if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
}
}
@ -155,7 +155,7 @@ public final class HTTPLoader {
// check length again in case it was not possible to get the length before loading
if (maxFileSize > 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
}
@ -177,7 +177,7 @@ public final class HTTPLoader {
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.length() == 0) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection header empty");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash, new Date(), 1, "redirection header empty");
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
}
@ -190,7 +190,7 @@ public final class HTTPLoader {
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "server shutdown");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash, new Date(), 1, "server shutdown");
throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
}
@ -200,7 +200,7 @@ public final class HTTPLoader {
// check if the url was already indexed
final String dbname = sb.urlExists(Segments.Process.LOCALCRAWLING, urlhash);
if (dbname != null) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection to double content");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash, new Date(), 1, "redirection to double content");
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
}
@ -210,7 +210,7 @@ public final class HTTPLoader {
}
} else {
// if the response has not the right response type then reject file
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode());
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode());
throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString());
}
} finally {

@ -77,7 +77,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
final boolean useTailCache,
final boolean exceed134217727) {
this.location = path;
this.urlIndexFile = new Cache(new SplitTable(this.location, tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727));
this.urlIndexFile = new Cache(new SplitTable(this.location, tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727), 20000000, 20000000);
this.exportthread = null; // will have a export thread assigned if exporter is running
this.statsDump = null;
}

@ -116,7 +116,7 @@ import net.yacy.document.TextParser;
import net.yacy.document.ParserException;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.content.file.SurrogateReader;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
@ -151,7 +151,6 @@ import de.anomic.crawler.ResourceObserver;
import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.crawler.retrieval.HTTPLoader;
@ -1998,13 +1997,7 @@ public final class Switchboard extends serverSwitch {
0,
0,
0);
final ZURL.Entry ee = crawlQueues.errorURL.newEntry(
bentry, initiator, new Date(),
0, failreason);
// store the entry
ee.store();
// push it onto the stack
crawlQueues.errorURL.push(ee);
crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
}
public int currentPPM() {

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.content.file;
package net.yacy.document.content;
import java.io.BufferedInputStream;
import java.io.File;
@ -36,7 +36,6 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.document.content.DCEntry;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

@ -31,8 +31,9 @@ import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.file.SurrogateReader;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
@ -93,6 +94,7 @@ public class OAIPMHImporter extends Thread implements Importer {
}
private static void load(Response response) throws IOException {
//FileUtils.copy(source, dest)
byte[] b = response.getContent();
SurrogateReader sr = new SurrogateReader(new ByteArrayInputStream(b), 100);
Thread srt = new Thread(sr);
@ -104,6 +106,10 @@ public class OAIPMHImporter extends Thread implements Importer {
try {
srt.join();
} catch (InterruptedException e) {}
ResumptionTokenReader rtr = new ResumptionTokenReader(new ByteArrayInputStream(b));
ResumptionToken token = rtr.getToken();
System.out.println("TOKEN: " + token.toString());
}
public static StringBuilder escape(final String s) {

@ -0,0 +1,82 @@
package net.yacy.document.importer;
import java.text.Collator;
import java.text.ParseException;
import java.util.Date;
import java.util.Locale;
import java.util.TreeMap;
import net.yacy.kelondro.util.DateFormatter;
public class ResumptionToken extends TreeMap<String, String> {
private static final long serialVersionUID = -8389462290545629792L;
// use a collator to compare keys case-insensitively (treat lowercase and uppercase letters as equal)
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
static {
insensitiveCollator.setStrength(Collator.SECONDARY);
insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}
public ResumptionToken(
Date expirationDate,
int completeListSize,
int cursor,
int token
) {
super((Collator) insensitiveCollator.clone());
this.put("expirationDate", DateFormatter.formatISO8601(expirationDate));
this.put("completeListSize", Integer.toString(completeListSize));
this.put("cursor", Integer.toString(cursor));
this.put("token", Integer.toString(token));
}
public ResumptionToken(
String expirationDate,
int completeListSize,
int cursor,
int token
) {
super((Collator) insensitiveCollator.clone());
this.put("expirationDate", expirationDate);
this.put("completeListSize", Integer.toString(completeListSize));
this.put("cursor", Integer.toString(cursor));
this.put("token", Integer.toString(token));
}
public Date getExpirationDate() {
String d = this.get("expirationDate");
if (d == null) return null;
try {
return DateFormatter.parseISO8601(d);
} catch (ParseException e) {
e.printStackTrace();
return new Date();
}
}
public int getCompleteListSize() {
String t = this.get("completeListSize");
if (t == null) return 0;
return Integer.parseInt(t);
}
public int getCursor() {
String t = this.get("cursor");
if (t == null) return 0;
return Integer.parseInt(t);
}
public int getToken() {
String t = this.get("token");
if (t == null) return 0;
return Integer.parseInt(t);
}
public String toString() {
return "expirationDate=" + DateFormatter.formatISO8601(this.getExpirationDate()) + ", completeListSize=" + getCompleteListSize() +
", cursor=" + this.getCursor() + ", token=" + this.getToken();
}
}

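Because ResumptionToken is just a case-insensitive TreeMap keyed by the attribute names of the OAI-PMH <resumptionToken> element, it can be constructed and inspected directly. A small usage sketch, with values taken from the XML example quoted in ResumptionTokenReader below:

    // construct from the attribute values of a <resumptionToken> element
    ResumptionToken token = new ResumptionToken("2009-10-31T22:52:14Z", 226, 0, 688);
    int remaining = token.getCompleteListSize() - token.getCursor(); // 226 records left
    System.out.println(token); // expirationDate=..., completeListSize=226, cursor=0, token=688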
@ -0,0 +1,90 @@
package net.yacy.document.importer;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class ResumptionTokenReader extends DefaultHandler {
// class variables
private final StringBuilder buffer;
private boolean parsingValue;
private ResumptionToken token;
private SAXParser saxParser;
private InputStream stream;
private Attributes atts;
public ResumptionTokenReader(final InputStream stream) throws IOException {
this.buffer = new StringBuilder();
this.parsingValue = false;
this.token = null;
this.stream = stream;
this.atts = null;
final SAXParserFactory factory = SAXParserFactory.newInstance();
try {
this.saxParser = factory.newSAXParser();
this.saxParser.parse(this.stream, this);
} catch (SAXException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (ParserConfigurationException e) {
e.printStackTrace();
throw new IOException(e.getMessage());
} finally {
try {
this.stream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
public ResumptionToken getToken() {
return this.token;
}
/*
<resumptionToken expirationDate="2009-10-31T22:52:14Z"
completeListSize="226"
cursor="0">688</resumptionToken>
*/
public void run() {
    // unused: the SAX parse already runs in the constructor
}
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
if ("resumptionToken".equals(tag)) {
this.parsingValue = true;
this.atts = atts;
}
}
public void endElement(final String uri, final String name, final String tag) {
if (tag == null) return;
if ("resumptionToken".equals(tag)) {
this.token = new ResumptionToken(
atts.getValue("expirationDate"),
Integer.parseInt(atts.getValue("completeListSize")),
Integer.parseInt(atts.getValue("cursor")),
Integer.parseInt(buffer.toString().trim()));
this.buffer.setLength(0);
this.parsingValue = false;
}
}
public void characters(final char ch[], final int start, final int length) {
if (parsingValue) {
buffer.append(ch, start, length);
}
}
}

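The reader parses the stream in its constructor, so the token is available immediately afterwards. A sketch of driving it with the sample element from the comment above (the byte array stands in for a full OAI-PMH response; checked exceptions omitted):

    String xml = "<resumptionToken expirationDate=\"2009-10-31T22:52:14Z\" "
               + "completeListSize=\"226\" cursor=\"0\">688</resumptionToken>";
    ResumptionTokenReader reader = new ResumptionTokenReader(
            new java.io.ByteArrayInputStream(xml.getBytes("UTF-8")));
    ResumptionToken token = reader.getToken(); // token value 688, cursor 0, size 226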
@ -145,10 +145,10 @@ public class MapView {
assert s != null;
synchronized (this) {
// write entry
blob.put(key.getBytes("UTF-8"), s.getBytes("UTF-8"));
if (blob != null) blob.put(key.getBytes("UTF-8"), s.getBytes("UTF-8"));
// write map to cache
cache.put(key, newMap);
if (cache != null) cache.put(key, newMap);
}
}

@ -63,9 +63,19 @@ public final class Cache implements ObjectIndex, Iterable<Row.Entry> {
private Row keyrow;
private int readHit, readMiss, writeUnique, writeDouble, cacheDelete, cacheFlush;
private int hasnotHit, hasnotMiss, hasnotUnique, hasnotDouble, hasnotDelete;
private int hitLimit, missLimit;
public Cache(final ObjectIndex backupIndex) {
/**
* create an ObjectIndex cache. The cache is either limited by a maximum number of entries
* in the hit/miss caches, or only by the available RAM
* @param backupIndex the ObjectIndex that is cached
* @param hitLimit the maximum number of cache hit entries; if <= 0, only the RAM limits the size
* @param missLimit the maximum number of cache miss entries; if <= 0, only the RAM limits the size
*/
public Cache(final ObjectIndex backupIndex, int hitLimit, int missLimit) {
this.index = backupIndex;
this.hitLimit = hitLimit;
this.missLimit = missLimit;
init();
objectTracker.put(backupIndex.filename(), this);
}
@ -103,6 +113,14 @@ public final class Cache implements ObjectIndex, Iterable<Row.Entry> {
return memStartShrink ;
}
public final int getHitLimit() {
return this.hitLimit;
}
public final int getMissLimit() {
return this.missLimit;
}
public static final Iterator<String> filenames() {
// iterates string objects; all file names from record tracker
return objectTracker.keySet().iterator();
@ -151,6 +169,11 @@ public final class Cache implements ObjectIndex, Iterable<Row.Entry> {
private final boolean checkMissSpace() {
// returns true if it is allowed to write into this cache
if (readMissCache == null) return false;
// check given limitation
if (this.missLimit > 0 && this.readMissCache.size() >= this.missLimit) return false;
// check memory
long available = MemoryControl.available();
if (available - 2 * 1024 * 1024 < readMissCache.memoryNeededForGrow()) {
readMissCache.clear();
@ -166,6 +189,11 @@ public final class Cache implements ObjectIndex, Iterable<Row.Entry> {
private final boolean checkHitSpace() {
// returns true if it is allowed to write into this cache
if (readHitCache == null) return false;
// check given limitation
if (this.hitLimit > 0 && this.readHitCache.size() >= this.hitLimit) return false;
// check memory
long available = MemoryControl.available();
if (available - 2 * 1024 * 1024 < readHitCache.memoryNeededForGrow()) {
readHitCache.clear();

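The two new constructor arguments thus give callers a choice between a hard entry-count limit and the old RAM-only behavior. A minimal sketch, assuming some ObjectIndex backupIndex:

    // bounded: each read cache refuses further writes beyond 20 million entries
    // (this is what MetadataRepository now does for the LURL metadata table)
    Cache bounded = new Cache(backupIndex, 20000000, 20000000);

    // unbounded: values <= 0 restore the previous behavior, limited only by available RAM
    Cache ramOnly = new Cache(backupIndex, 0, 0);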