From 30888e7a2fdf491377843b4919e7471b5d2deab9 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 23 Nov 2006 02:16:30 +0000 Subject: [PATCH] implementation of search constraints Such constraints may formulate specific restrictions on web searches. This is implemented by scraping information for constraints from a web page during parsing, and storing flags for the pages within the web index. In this first step, only information for index pages ("index of", directory listings) is scraped and stored in flags. - added new flag class kelondroBitfield - added scraper method in condenser - added bitfield structure for all scrape types (see also condenser) - added bitfield structure for appearance locations (see RWIEntry) - added handover protocol for remote search and index distribution - extended kelondroColumn class to hold bitfield types - added another search attribute on search page (index.html) - extended search-filter to enable filtering of non-matching constraints - set all new database types to be default - refactoring: moved word hash generation to condenser class git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2999 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- htroot/DetailedSearch.java | 2 +- htroot/IndexControl_p.java | 11 +- htroot/IndexCreate_p.java | 6 +- htroot/env/templates/simpleheader.template | 2 +- htroot/htdocsdefault/dir.java | 6 +- htroot/index.html | 9 + htroot/index.java | 1 + htroot/xml/snippet.java | 3 +- htroot/yacy/crawlReceipt.java | 4 +- htroot/yacy/search.java | 11 +- htroot/yacysearch.html | 1 + htroot/yacysearch.java | 14 +- source/de/anomic/data/bookmarksDB.java | 3 +- source/de/anomic/index/indexRWIEntry.java | 3 +- source/de/anomic/index/indexRWIEntryNew.java | 64 ++++--- source/de/anomic/index/indexRWIEntryOld.java | 6 +- source/de/anomic/index/indexURLEntryNew.java | 19 +- source/de/anomic/index/indexURLEntryOld.java | 4 +- .../anomic/kelondro/kelondroBase64Order.java | 8 +- 
.../de/anomic/kelondro/kelondroBitfield.java | 166 ++++++++++++++++++ source/de/anomic/kelondro/kelondroColumn.java | 10 ++ .../kelondro/kelondroFlexWidthArray.java | 2 +- source/de/anomic/kelondro/kelondroRow.java | 24 ++- .../plasma/crawler/AbstractCrawlWorker.java | 4 +- source/de/anomic/plasma/plasmaCondenser.java | 159 ++++++++++------- source/de/anomic/plasma/plasmaCrawlEURL.java | 12 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 4 +- source/de/anomic/plasma/plasmaCrawlNURL.java | 16 +- .../de/anomic/plasma/plasmaCrawlStacker.java | 16 +- .../anomic/plasma/plasmaRankingCRProcess.java | 18 +- .../de/anomic/plasma/plasmaSearchEvent.java | 4 +- .../anomic/plasma/plasmaSearchPreOrder.java | 1 + .../de/anomic/plasma/plasmaSearchQuery.java | 34 ++-- .../plasma/plasmaSearchRankingProfile.java | 4 +- .../de/anomic/plasma/plasmaSearchResult.java | 2 +- .../de/anomic/plasma/plasmaSnippetCache.java | 4 +- .../de/anomic/plasma/plasmaSwitchboard.java | 26 +-- source/de/anomic/plasma/plasmaURL.java | 5 - source/de/anomic/plasma/plasmaWordIndex.java | 11 +- source/de/anomic/yacy/yacyClient.java | 10 +- source/de/anomic/yacy/yacySearch.java | 20 ++- source/yacy.java | 10 +- yacy.init | 6 +- 44 files changed, 506 insertions(+), 241 deletions(-) create mode 100644 source/de/anomic/kelondro/kelondroBitfield.java diff --git a/build.properties b/build.properties index c3ff07c26..6aa6c2d48 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.486 +releaseVersion=0.487 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr} diff --git a/htroot/DetailedSearch.java b/htroot/DetailedSearch.java index f658e796e..0c613969f 100644 --- a/htroot/DetailedSearch.java +++ b/htroot/DetailedSearch.java @@ -137,7 +137,7 @@ public class DetailedSearch { // do the 
search plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", count, searchtime, urlmask, ((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL, - "", 20); + "", 20, plasmaSearchQuery.catchall_constraint); plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString()); plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(4 * thisSearch.maximumTime / 10, thisSearch.wantedResults); plasmaSearchTimingProfile remoteTiming = new plasmaSearchTimingProfile(6 * thisSearch.maximumTime / 10, thisSearch.wantedResults); diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 6fd6491f3..92d81c537 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -61,6 +61,7 @@ import de.anomic.index.indexRWIEntry; import de.anomic.plasma.plasmaURL; import de.anomic.index.indexURLEntry; import de.anomic.net.URL; +import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.urlPattern.plasmaURLPattern; @@ -175,7 +176,7 @@ public class IndexControl_p { switchboard.wordIndex.deleteContainer(keyhash); post.remove("keyhashdeleteall"); if (keystring.length() > 0 && - plasmaURL.word2hash(keystring).equals(keyhash)) { + plasmaCondenser.word2hash(keystring).equals(keyhash)) { post.put("keystringsearch", "generated"); } else { post.put("keyhashsearch", "generated"); @@ -198,7 +199,7 @@ public class IndexControl_p { // this shall lead to a presentation of the list; so handle that the remaining program // thinks that it was called for a list presentation post.remove("keyhashdelete"); - if (keystring.length() > 0 && plasmaURL.word2hash(keystring).equals(keyhash)) { + if (keystring.length() > 0 && plasmaCondenser.word2hash(keystring).equals(keyhash)) { post.put("keystringsearch", "generated"); } else { 
post.put("keyhashsearch", "generated"); @@ -228,7 +229,7 @@ public class IndexControl_p { } if (post.containsKey("keystringsearch")) { - keyhash = plasmaURL.word2hash(keystring); + keyhash = plasmaCondenser.word2hash(keystring); prop.put("keyhash", keyhash); prop.put("urlstring", ""); prop.put("urlhash", ""); @@ -236,7 +237,7 @@ public class IndexControl_p { } if (post.containsKey("keyhashsearch")) { - if (keystring.length() == 0 || !plasmaURL.word2hash(keystring).equals(keyhash)) { + if (keystring.length() == 0 || !plasmaCondenser.word2hash(keystring).equals(keyhash)) { prop.put("keystring", ""); } prop.put("urlstring", ""); @@ -246,7 +247,7 @@ public class IndexControl_p { // transfer to other peer if (post.containsKey("keyhashtransfer")) { - if (keystring.length() == 0 || !plasmaURL.word2hash(keystring).equals(keyhash)) { + if (keystring.length() == 0 || !plasmaCondenser.word2hash(keystring).equals(keyhash)) { prop.put("keystring", ""); } prop.put("urlstring", ""); diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index a92d513f6..b737083a2 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -59,6 +59,7 @@ import de.anomic.data.wikiCode; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpHeader; +import de.anomic.kelondro.kelondroBitfield; import de.anomic.plasma.plasmaURL; import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlEURL; @@ -68,7 +69,6 @@ import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.serverThread; -import de.anomic.tools.bitfield; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyNewsPool; import de.anomic.yacy.yacyNewsRecord; @@ -204,7 +204,7 @@ public class IndexCreate_p { prop.put("error_reasonString", reasonString); plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, 
yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - crawlingStartURL.getHost(), reasonString, new bitfield()); + crawlingStartURL.getHost(), reasonString, new kelondroBitfield()); ee.store(); switchboard.urlPool.errorURL.stackPushEntry(ee); } @@ -282,7 +282,7 @@ public class IndexCreate_p { c++; } else { plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - (String) e.getValue(), rejectReason, new bitfield()); + (String) e.getValue(), rejectReason, new kelondroBitfield()); ee.store(); switchboard.urlPool.errorURL.stackPushEntry(ee); } diff --git a/htroot/env/templates/simpleheader.template b/htroot/env/templates/simpleheader.template index eb4d42d4c..4d2465b26 100644 --- a/htroot/env/templates/simpleheader.template +++ b/htroot/env/templates/simpleheader.template @@ -1,6 +1,6 @@