From a0e891c63d7a195e8881496fa5d301d347287575 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sat, 31 Oct 2009 11:58:06 +0000 Subject: [PATCH] - some redesign in UI menu structure to make room for new 'Content Integration' main menu containing import servlets for Wikimedia Dumps, phpbb3 forum imports and OAI-PMH imports - extended the OAI-PMH test applet and integrated it into the menu. Does still not import OAI-PMH records, but shows that it is able to read and parse this data - some redesign in ZURL storage: refactoring of access methods, better concurrency, less synchronization - added a limitation to the LURL metadata database table cache to 20 million entries: this cache was until now not limited and only limited by the available RAM which may have caused a memory-leak-like behavior. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6440 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ConfigLiveSearch.html | 2 +- htroot/ConfigPHPBB3Search.html | 2 +- htroot/ConfigPortal.html | 2 +- htroot/ConfigSearchBox.html | 2 +- htroot/ConfigWikiSearch.html | 2 +- htroot/ContentIntegrationPHPBB3_p.html | 2 +- htroot/IndexCreateParserErrors_p.java | 4 +- htroot/IndexImportOAIPMH_p.html | 6 +- htroot/IndexImportOAIPMH_p.java | 7 +- htroot/IndexImportWikimedia_p.html | 2 +- htroot/WatchCrawler_p.java | 6 +- htroot/env/templates/header.template | 5 +- .../submenuContentIntegration.template | 8 ++ .../env/templates/submenuIndexCreate.template | 25 ++---- ...late => submenuPortalIntegration.template} | 1 - htroot/yacy/crawlReceipt.java | 5 +- htroot/yacy/urls.java | 4 +- source/de/anomic/crawler/CrawlQueues.java | 12 +-- source/de/anomic/crawler/CrawlStacker.java | 4 +- source/de/anomic/crawler/ZURL.java | 80 ++++++++++------- .../anomic/crawler/retrieval/FTPLoader.java | 6 +- .../anomic/crawler/retrieval/HTTPLoader.java | 18 ++-- .../de/anomic/search/MetadataRepository.java | 2 +- source/de/anomic/search/Switchboard.java | 11 +-- .../content/{file => 
}/SurrogateReader.java | 3 +- .../document/importer/OAIPMHImporter.java | 8 +- .../document/importer/ResumptionToken.java | 82 +++++++++++++++++ .../importer/ResumptionTokenReader.java | 90 +++++++++++++++++++ source/net/yacy/kelondro/blob/MapView.java | 4 +- source/net/yacy/kelondro/index/Cache.java | 30 ++++++- 30 files changed, 313 insertions(+), 122 deletions(-) create mode 100644 htroot/env/templates/submenuContentIntegration.template rename htroot/env/templates/{submenuIntegration.template => submenuPortalIntegration.template} (82%) rename source/net/yacy/document/content/{file => }/SurrogateReader.java (96%) create mode 100644 source/net/yacy/document/importer/ResumptionToken.java create mode 100644 source/net/yacy/document/importer/ResumptionTokenReader.java diff --git a/htroot/ConfigLiveSearch.html b/htroot/ConfigLiveSearch.html index a87b72766..90d7707e9 100644 --- a/htroot/ConfigLiveSearch.html +++ b/htroot/ConfigLiveSearch.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIntegration.template%# + #%env/templates/submenuPortalIntegration.template%#

Integration of a Search Field for Live Search

A 'Live-Search' input field that reacts as search-as-you-type in a pop-up window can easily be integrated in any web page. diff --git a/htroot/ConfigPHPBB3Search.html b/htroot/ConfigPHPBB3Search.html index d49bd5c13..39013786c 100644 --- a/htroot/ConfigPHPBB3Search.html +++ b/htroot/ConfigPHPBB3Search.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIntegration.template%# + #%env/templates/submenuPortalIntegration.template%#

Integration in phpBB3

It is possible to insert forum pages into the YaCy index using a database import of forum postings. diff --git a/htroot/ConfigPortal.html b/htroot/ConfigPortal.html index 26c17f439..ace22336b 100644 --- a/htroot/ConfigPortal.html +++ b/htroot/ConfigPortal.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIntegration.template%# + #%env/templates/submenuPortalIntegration.template%#

Integration of a Search Portal

If you like to integrate YaCy as portal for your web pages, you may want to change icons and messages on the search page. diff --git a/htroot/ConfigSearchBox.html b/htroot/ConfigSearchBox.html index 197a26cf8..f3f7f0892 100644 --- a/htroot/ConfigSearchBox.html +++ b/htroot/ConfigSearchBox.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIntegration.template%# + #%env/templates/submenuPortalIntegration.template%#

Integration of a Search Box

We give information how to integrate a search box on any web page that diff --git a/htroot/ConfigWikiSearch.html b/htroot/ConfigWikiSearch.html index e7ff3c408..d156e9e0e 100644 --- a/htroot/ConfigWikiSearch.html +++ b/htroot/ConfigWikiSearch.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIntegration.template%# + #%env/templates/submenuPortalIntegration.template%#

Integration in MediaWiki

It is possible to insert wiki pages into the YaCy index using a web crawl on those pages. diff --git a/htroot/ContentIntegrationPHPBB3_p.html b/htroot/ContentIntegrationPHPBB3_p.html index d4b137fe8..24bd0ba5f 100644 --- a/htroot/ContentIntegrationPHPBB3_p.html +++ b/htroot/ContentIntegrationPHPBB3_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuContentIntegration.template%#

Content Integration: Retrieval from phpBB3 Databases

It is possible to extract texts directly from mySQL and postgreSQL databases. diff --git a/htroot/IndexCreateParserErrors_p.java b/htroot/IndexCreateParserErrors_p.java index 8e270f066..6f1146f08 100644 --- a/htroot/IndexCreateParserErrors_p.java +++ b/htroot/IndexCreateParserErrors_p.java @@ -69,11 +69,9 @@ public class IndexCreateParserErrors_p { dark = true; DigestURI url; String initiatorHash, executorHash; - ZURL.Entry entry; yacySeed initiatorSeed, executorSeed; int j=0; - for (int i = sb.crawlQueues.errorURL.stackSize() - 1; i >= (sb.crawlQueues.errorURL.stackSize() - showRejectedCount); i--) { - entry = sb.crawlQueues.errorURL.top(i); + for (ZURL.Entry entry: sb.crawlQueues.errorURL) { if (entry == null) continue; url = entry.url(); if (url == null) continue; diff --git a/htroot/IndexImportOAIPMH_p.html b/htroot/IndexImportOAIPMH_p.html index 0bcf4624c..127e95c9b 100644 --- a/htroot/IndexImportOAIPMH_p.html +++ b/htroot/IndexImportOAIPMH_p.html @@ -7,7 +7,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIntegration.template%# + #%env/templates/submenuContentIntegration.template%#

OAI-PMH Import

#(import)# @@ -15,8 +15,8 @@
OAI-PMH Import: set an OAI-PMH URL - - + +
:: diff --git a/htroot/IndexImportOAIPMH_p.java b/htroot/IndexImportOAIPMH_p.java index c8e334eef..a8ce98502 100644 --- a/htroot/IndexImportOAIPMH_p.java +++ b/htroot/IndexImportOAIPMH_p.java @@ -22,7 +22,6 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -import java.io.File; import java.net.MalformedURLException; import net.yacy.document.importer.OAIPMHImporter; @@ -55,16 +54,16 @@ public class IndexImportOAIPMH_p { if (post == null) { prop.put("import_status", 0); } else { - if (post.containsKey("file")) { + if (post.containsKey("oaipmhurl")) { String oaipmhurl = post.get("oaipmhurl"); DigestURI url = null; try { url = new DigestURI(oaipmhurl, null); OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url); OAIPMHImporter.job.start(); - prop.put("import", 1); + prop.put("import", 0); prop.put("import_thread", "started"); - prop.put("import_dump", OAIPMHImporter.job.source()); + prop.put("import_source", OAIPMHImporter.job.source()); prop.put("import_count", 0); prop.put("import_speed", 0); prop.put("import_runningHours", 0); diff --git a/htroot/IndexImportWikimedia_p.html b/htroot/IndexImportWikimedia_p.html index d46eea8bf..b25b43429 100644 --- a/htroot/IndexImportWikimedia_p.html +++ b/htroot/IndexImportWikimedia_p.html @@ -7,7 +7,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIntegration.template%# + #%env/templates/submenuContentIntegration.template%#

Wikimedia Dump Import

#(import)# diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index d0c2663f5..daa5cd862 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -43,7 +43,6 @@ import net.yacy.kelondro.util.FileUtils; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.SitemapImporter; -import de.anomic.crawler.ZURL; import de.anomic.crawler.retrieval.Request; import de.anomic.data.bookmarksDB; import de.anomic.data.listManager; @@ -300,7 +299,7 @@ public class WatchCrawler_p { prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); prop.putHTML("info_reasonString", reasonString); - final ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry( + sb.crawlQueues.errorURL.push( new Request( sb.peers.mySeed().hash, crawlingStartURL, @@ -316,9 +315,6 @@ public class WatchCrawler_p { new Date(), 1, reasonString); - - ee.store(); - sb.crawlQueues.errorURL.push(ee); } } catch (final PatternSyntaxException e) { prop.put("info", "4"); //crawlfilter does not match url diff --git a/htroot/env/templates/header.template b/htroot/env/templates/header.template index 712e1770d..99a87c6dd 100644 --- a/htroot/env/templates/header.template +++ b/htroot/env/templates/header.template @@ -58,10 +58,11 @@