From 15397298dc44e68d85133c781212cada54f0e1d3 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 24 Jan 2008 22:49:00 +0000 Subject: [PATCH] - refactoring of indexControlRWIs: moved statics to own class; better Dublin Core naming - fix for http://forum.yacy-websuche.de/viewtopic.php?f=5&t=759&hilit=&p=4866#p4866 - some bugfixes in EcoTable according remove method - switched more tables to Eco: crawl Profiles, htcache, seeddb, newsdb git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4397 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControlRWIs_p.html | 18 +- htroot/IndexControlRWIs_p.java | 174 +-------------- source/de/anomic/http/httpc.java | 7 +- source/de/anomic/kelondro/kelondroDyn.java | 6 +- .../de/anomic/kelondro/kelondroEcoTable.java | 47 ++-- .../kelondro/kelondroRotateIterator.java | 5 +- .../dbImport/plasmaCrawlNURLImporter.java | 8 +- .../de/anomic/plasma/plasmaCrawlProfile.java | 4 +- .../anomic/plasma/plasmaCrawlRobotsTxt.java | 34 +-- source/de/anomic/plasma/plasmaHTCache.java | 4 +- source/de/anomic/plasma/plasmaSearchAPI.java | 206 ++++++++++++++++++ .../de/anomic/plasma/plasmaSwitchboard.java | 6 +- source/de/anomic/yacy/yacyCore.java | 9 +- source/de/anomic/yacy/yacyNewsDB.java | 12 +- source/de/anomic/yacy/yacyNewsPool.java | 4 +- yacy.init | 3 - 16 files changed, 302 insertions(+), 245 deletions(-) create mode 100644 source/de/anomic/plasma/plasmaSearchAPI.java diff --git a/htroot/IndexControlRWIs_p.html b/htroot/IndexControlRWIs_p.html index ee2ca7f5c..e292fd8d8 100644 --- a/htroot/IndexControlRWIs_p.html +++ b/htroot/IndexControlRWIs_p.html @@ -40,10 +40,10 @@ document type   - reference description - author - tags + title + creator + subject url emphasized image @@ -55,10 +55,10 @@   #[allurl]# - #[reference]# #[description]# - #[author]# - #[tag]# + #[title]# + #[creator]# + #[subject]# #[url]# #[emphasized]# #[image]# @@ -70,10 +70,10 @@ Selection - - - + + + diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index bbf12c3fe..9342d6ede 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -29,7 +29,6 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; -import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -43,13 +42,11 @@ import de.anomic.index.indexRWIRowEntry; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBitfield; import de.anomic.plasma.plasmaCondenser; +import de.anomic.plasma.plasmaSearchAPI; import de.anomic.plasma.plasmaSearchEvent; -import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProcess; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.urlPattern.abstractURLPattern; -import de.anomic.plasma.urlPattern.plasmaURLPattern; -import de.anomic.server.serverDate; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyClient; @@ -92,7 +89,7 @@ public class IndexControlRWIs_p { if (post.containsKey("keystringsearch")) { keyhash = plasmaCondenser.word2hash(keystring); prop.put("keyhash", keyhash); - final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, null, sortorder, false); + final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, null, sortorder, false); if (ranking.filteredCount() == 0) { prop.put("searchresult", 1); prop.put("searchresult_word", keystring); @@ -103,7 +100,7 @@ public class IndexControlRWIs_p { if (keystring.length() == 0 || !plasmaCondenser.word2hash(keystring).equals(keyhash)) { prop.put("keystring", "<not possible to compute word from hash>"); } - final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, null, sortorder, false); + final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, null, sortorder, false); if (ranking.filteredCount() == 0) { prop.put("searchresult", 2); prop.put("searchresult_wordhash", keyhash); @@ -160,10 +157,10 @@ public class IndexControlRWIs_p { if (keystring.length() == 0 || !plasmaCondenser.word2hash(keystring).equals(keyhash)) { prop.put("keystring", "<not possible to compute word from hash>"); } - kelondroBitfield flags = compileFlags(post); + kelondroBitfield flags = plasmaSearchAPI.compileFlags(post); int count = (post.get("lines", "all").equals("all")) ? -1 : post.getInt("lines", -1); - final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, flags, sortorder, true); - genURLList(prop, keyhash, keystring, ranking, flags, count, sortorder); + final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, flags, sortorder, true); + plasmaSearchAPI.genURLList(prop, keyhash, keystring, ranking, flags, count, sortorder); } // transfer to other peer @@ -307,7 +304,7 @@ public class IndexControlRWIs_p { sb.wordIndex.removeEntries(keyhash, urlHashes); } - if (prop.getInt("searchresult", 0) == 3) listHosts(prop, keyhash); + if (prop.getInt("searchresult", 0) == 3) plasmaSearchAPI.listHosts(prop, keyhash); } @@ -317,161 +314,4 @@ public class IndexControlRWIs_p { return prop; } - private static kelondroBitfield compileFlags(serverObjects post) { - kelondroBitfield b = new kelondroBitfield(4); - if (post.get("allurl", "").equals("on")) return null; - if (post.get("flags") != null) { - if (post.get("flags","").length() == 0) return null; - return new kelondroBitfield(4, (String) post.get("flags")); - } - if (post.get("reference", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_description, true); - if (post.get("description", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_title, true); - if (post.get("author", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_creator, true); - if (post.get("tag", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_subject, true); - if (post.get("url", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_identifier, true); - if (post.get("emphasized", "").equals("on")) b.set(indexRWIEntry.flag_app_emphasized, true); - if (post.get("image", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasimage, true); - if (post.get("audio", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasaudio, true); - if (post.get("video", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasvideo, true); - if (post.get("app", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasapp, true); - if (post.get("indexof", "").equals("on")) b.set(plasmaCondenser.flag_cat_indexof, true); - return b; - } - - private static void listHosts(serverObjects prop, String startHash) { - // list known hosts - yacySeed seed; - int hc = 0; - prop.put("searchresult_keyhash", startHash); - if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) { - Iterator e = yacyCore.dhtAgent.getAcceptRemoteIndexSeeds(startHash); - while (e.hasNext()) { - seed = (yacySeed) e.next(); - if (seed != null) { - prop.put("searchresult_hosts_" + hc + "_hosthash", seed.hash); - prop.putHTML("searchresult_hosts_" + hc + "_hostname", seed.hash + " " + seed.get(yacySeed.NAME, "nameless")); - hc++; - } - } - prop.put("searchresult_hosts", hc); - } else { - prop.put("searchresult_hosts", "0"); - } - } - - private static plasmaSearchRankingProcess genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, int sortorder, boolean fetchURLs) { - plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, sb.getRanking(), filter); - plasmaSearchRankingProcess ranked = new plasmaSearchRankingProcess(sb.wordIndex, query, sortorder, Integer.MAX_VALUE); - ranked.execQuery(fetchURLs); - - if (ranked.filteredCount() == 0) { - prop.put("searchresult", 2); - prop.put("searchresult_wordhash", keyhash); - } else { - prop.put("searchresult", 3); - prop.put("searchresult_allurl", ranked.filteredCount()); - prop.put("searchresult_reference", ranked.flagCount()[indexRWIEntry.flag_app_dc_description]); - prop.put("searchresult_description", ranked.flagCount()[indexRWIEntry.flag_app_dc_title]); - prop.put("searchresult_author", ranked.flagCount()[indexRWIEntry.flag_app_dc_creator]); - prop.put("searchresult_tag", ranked.flagCount()[indexRWIEntry.flag_app_dc_subject]); - prop.put("searchresult_url", ranked.flagCount()[indexRWIEntry.flag_app_dc_identifier]); - prop.put("searchresult_emphasized", ranked.flagCount()[indexRWIEntry.flag_app_emphasized]); - prop.put("searchresult_image", ranked.flagCount()[plasmaCondenser.flag_cat_hasimage]); - prop.put("searchresult_audio", ranked.flagCount()[plasmaCondenser.flag_cat_hasaudio]); - prop.put("searchresult_video", ranked.flagCount()[plasmaCondenser.flag_cat_hasvideo]); - prop.put("searchresult_app", ranked.flagCount()[plasmaCondenser.flag_cat_hasapp]); - prop.put("searchresult_indexof", ranked.flagCount()[plasmaCondenser.flag_cat_indexof]); - } - return ranked; - } - - private static void genURLList(serverObjects prop, String keyhash, String keystring, plasmaSearchRankingProcess ranked, kelondroBitfield flags, int maxlines, int ordering) { - // search for a word hash and generate a list of url links - prop.put("genUrlList_keyHash", keyhash); - - if (ranked.filteredCount() == 0) { - prop.put("genUrlList", 1); - prop.put("genUrlList_count", 0); - prop.put("searchresult", 2); - } else { - prop.put("genUrlList", 2); - prop.put("searchresult", 3); - prop.put("genUrlList_flags", (flags == null) ? "" : flags.exportB64()); - prop.put("genUrlList_lines", maxlines); - prop.put("genUrlList_ordering", ordering); - int i = 0; - yacyURL url; - indexURLEntry entry; - String us; - long rn = -1; - while ((ranked.size() > 0) && ((entry = ranked.bestURL(false)) != null)) { - if ((entry == null) || (entry.comp() == null)) continue; - url = entry.comp().url(); - if (url == null) continue; - us = url.toNormalform(false, false); - if (rn == -1) rn = entry.ranking(); - prop.put("genUrlList_urlList_"+i+"_urlExists", "1"); - prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i); - prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().urlHash()); - prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring); - prop.put("genUrlList_urlList_"+i+"_urlExists_keyHash", keyhash); - prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlString", us); - prop.put("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "
" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "
" + us.substring(20)) : us)); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn)); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", yacyURL.domLengthEstimation(entry.hash())); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", plasmaSearchRankingProcess.ybr(entry.hash())); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", ranked.getOrder().authority(entry.hash())); - prop.put("genUrlList_urlList_"+i+"_urlExists_date", serverDate.formatShortDay(new Date(entry.word().lastModified()))); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_phrasesintext", entry.word().phrasesintext()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_llocal", entry.word().llocal()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_lother", entry.word().lother()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_hitcount", entry.word().hitcount()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_worddistance", entry.word().worddistance()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_pos", entry.word().posintext()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_posinphrase", entry.word().posinphrase()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.word().urlcomps()); - prop.putNum("genUrlList_urlList_"+i+"_urlExists_urllength", entry.word().urllength()); - prop.put("genUrlList_urlList_"+i+"_urlExists_props", - ((entry.word().flags().get(plasmaCondenser.flag_cat_indexof)) ? "appears on index page, " : "") + - ((entry.word().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") + - ((entry.word().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") + - ((entry.word().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") + - ((entry.word().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") + - ((entry.word().flags().get(indexRWIEntry.flag_app_dc_identifier)) ? "appears in url, " : "") + - ((entry.word().flags().get(indexRWIEntry.flag_app_dc_title)) ? "appears in description, " : "") + - ((entry.word().flags().get(indexRWIEntry.flag_app_dc_creator)) ? "appears in author, " : "") + - ((entry.word().flags().get(indexRWIEntry.flag_app_dc_subject)) ? "appears in tags, " : "") + - ((entry.word().flags().get(indexRWIEntry.flag_app_dc_description)) ? "appears in reference, " : "") + - ((entry.word().flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized, " : "") + - ((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "") - ); - if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, url)) { - prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxChecked", "1"); - } - i++; - if ((maxlines >= 0) && (i >= maxlines)) break; - } - Iterator iter = ranked.miss(); // iterates url hash strings - while (iter.hasNext()) { - us = (String) iter.next(); - prop.put("genUrlList_urlList_"+i+"_urlExists", "0"); - prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i); - prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", us); - i++; - } - prop.put("genUrlList_urlList", i); - prop.putHTML("genUrlList_keyString", keystring); - prop.put("genUrlList_count", i); - putBlacklists(prop, listManager.getDirListing(listManager.listsPath)); - } - } - - private static void putBlacklists(serverObjects prop, String[] lists) { - prop.put("genUrlList_blacklists", lists.length); - for (int i=0; i doubles = index.removeDoubles(); + System.out.println(" -removed " + doubles.size() + " doubles- done."); if (doubles.size() > 0) { System.out.println("DEBUG " + tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles"); // from all the doubles take one, put it back to the index and remove the others from the file @@ -392,51 +401,57 @@ public class kelondroEcoTable implements kelondroIndex { assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size(); assert ((table == null) || (table.size() == index.size())); assert keepOrder == false; // this class cannot keep the order during a remove + assert key.length == rowdef.primaryKeyLength; int i = index.geti(key); if (i == -1) return null; // nothing to do // prepare result byte[] b = new byte[rowdef.objectsize]; byte[] p = new byte[rowdef.objectsize]; + int sb = index.size(); if (table == null) { - index.removei(key); - file.get(i, b, 0); - file.cleanLast(p, 0); - file.put(i, p, 0); - byte[] k = new byte[rowdef.primaryKeyLength]; - System.arraycopy(p, 0, k, 0, rowdef.primaryKeyLength); - index.puti(k, i); + if (i == index.size() - 1) { + index.removei(key); + file.clean(i, b, 0); + } else { + index.removei(key); + file.get(i, b, 0); + file.cleanLast(p, 0); + file.put(i, p, 0); + byte[] k = new byte[rowdef.primaryKeyLength]; + System.arraycopy(p, 0, k, 0, rowdef.primaryKeyLength); + index.puti(k, i); + } assert (file.size() == index.size()); - assert ((table == null) || (table.size() == index.size())); } else { + // get result value from the table copy, so we don't need to read it from the file kelondroRow.Entry v = table.get(i); - assert key.length == rowdef.primaryKeyLength; System.arraycopy(key, 0, b, 0, key.length); System.arraycopy(v.bytes(), 0, b, rowdef.primaryKeyLength, taildef.objectsize); + if (i == index.size() - 1) { // special handling if the entry is the last entry in the file index.removei(key); table.removeRow(i, false); file.clean(i); - assert (file.size() == index.size()); - assert ((table == null) || (table.size() == index.size())); } else { // switch values + index.removei(key); + kelondroRow.Entry te = table.removeOne(); table.set(i, te); file.cleanLast(p, 0); file.put(i, p, 0); kelondroRow.Entry lr = rowdef.newEntry(p); - - index.removei(key); index.puti(lr.getPrimaryKeyBytes(), i); - assert (file.size() == index.size()); - assert ((table == null) || (table.size() == index.size())) : "table.size() = " + table.size() + ", index.size() = " + index.size(); } + assert (file.size() == index.size()); + assert (table.size() == index.size()) : "table.size() = " + table.size() + ", index.size() = " + index.size(); } assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size(); assert ((table == null) || (table.size() == index.size())); + assert index.size() + 1 == sb : "index.size() = " + index.size() + ", sb = " + sb; return rowdef.newEntry(b); } @@ -448,7 +463,7 @@ public class kelondroEcoTable implements kelondroIndex { kelondroRow.Entry lr = rowdef.newEntry(le); int i = index.removei(lr.getPrimaryKeyBytes()); assert i >= 0; - table.removeRow(i, false); + if (table != null) table.removeOne(); assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size(); return lr; } diff --git a/source/de/anomic/kelondro/kelondroRotateIterator.java b/source/de/anomic/kelondro/kelondroRotateIterator.java index de0e0dc2c..d5d6c115e 100644 --- a/source/de/anomic/kelondro/kelondroRotateIterator.java +++ b/source/de/anomic/kelondro/kelondroRotateIterator.java @@ -30,12 +30,14 @@ public class kelondroRotateIterator implements kelondroCloneableIterator { kelondroCloneableIterator a, clone; Object modifier; + boolean nempty; public kelondroRotateIterator(kelondroCloneableIterator a, Object modifier) { // this works currently only for String-type key iterations this.a = a; this.modifier = modifier; this.clone = (kelondroCloneableIterator) a.clone(modifier); + this.nempty = this.clone.hasNext(); } public kelondroRotateIterator clone(Object modifier) { @@ -43,7 +45,7 @@ public class kelondroRotateIterator implements kelondroCloneableIterator { } public boolean hasNext() { - return true; + return this.nempty; } public E next() { @@ -52,6 +54,7 @@ public class kelondroRotateIterator implements kelondroCloneableIterator { // from the hasNext() method if (!(a.hasNext())) { a = (kelondroCloneableIterator) clone.clone(modifier); + assert a.hasNext(); } return a.next(); } diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java index e42487099..728b97fec 100644 --- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java @@ -15,7 +15,7 @@ import de.anomic.plasma.plasmaSwitchboard; public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImporter { private File plasmaPath = null; - private HashSet importProfileHandleCache = new HashSet(); + private HashSet importProfileHandleCache = new HashSet(); private plasmaCrawlProfile importProfileDB; private plasmaCrawlNURL importNurlDB; private int importStartSize; @@ -129,8 +129,8 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack."); } - // getting an interator and loop through the URL entries - Iterator entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null; + // getting an iterator and loop through the URL entries + Iterator entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null; while (true) { String nextHash = null; @@ -147,7 +147,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor if (!entryIter.hasNext()) break; this.urlCount++; - nextEntry = (plasmaCrawlEntry) entryIter.next(); + nextEntry = entryIter.next(); nextHash = nextEntry.url().hash(); } } catch (IOException e) { diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java index 6c631541d..57cd12758 100644 --- a/source/de/anomic/plasma/plasmaCrawlProfile.java +++ b/source/de/anomic/plasma/plasmaCrawlProfile.java @@ -70,7 +70,7 @@ public class plasmaCrawlProfile { this.profileTableFile = file; this.preloadTime = preloadTime; profileTableFile.getParentFile().mkdirs(); - kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true); + kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, false, false, true); profileTable = new kelondroMapObjects(dyn, 500); } @@ -79,7 +79,7 @@ public class plasmaCrawlProfile { if (profileTable != null) profileTable.close(); if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database"); profileTableFile.getParentFile().mkdirs(); - kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true); + kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, false, false, true); profileTable = new kelondroMapObjects(dyn, 500); } diff --git a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java index 14a873dba..b74bff761 100644 --- a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java +++ b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java @@ -72,7 +72,7 @@ public class plasmaCrawlRobotsTxt { this.robotsTableFile = robotsTableFile; this.preloadTime = preloadTime; robotsTableFile.getParentFile().mkdirs(); - robotsTable = new kelondroMapObjects(new kelondroDyn(robotsTableFile, true, true, preloadTime, 256, 512, '_', kelondroNaturalOrder.naturalOrder, true, false, true), 100); + robotsTable = new kelondroMapObjects(new kelondroDyn(robotsTableFile, true, true, preloadTime, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true), 100); } private void resetDatabase() { @@ -80,7 +80,7 @@ public class plasmaCrawlRobotsTxt { if (robotsTable != null) robotsTable.close(); if (!(robotsTableFile.delete())) throw new RuntimeException("cannot delete robots.txt database"); robotsTableFile.getParentFile().mkdirs(); - robotsTable = new kelondroMapObjects(new kelondroDyn(robotsTableFile, true, true, preloadTime, 256, 512, '_', kelondroNaturalOrder.naturalOrder, true, false, true), 100); + robotsTable = new kelondroMapObjects(new kelondroDyn(robotsTableFile, true, true, preloadTime, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true), 100); } public void close() { @@ -103,7 +103,7 @@ public class plasmaCrawlRobotsTxt { public Entry getEntry(String hostName) { try { - Map record = this.robotsTable.getMap(hostName); + Map record = this.robotsTable.getMap(hostName); if (record == null) return null; return new Entry(hostName, record); } catch (kelondroException e) { @@ -114,14 +114,16 @@ public class plasmaCrawlRobotsTxt { public Entry addEntry( String hostName, - ArrayList disallowPathList, + ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag, String sitemap, Integer crawlDelay ) { - Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap,crawlDelay); + Entry entry = new Entry( + hostName, disallowPathList, loadedDate, modDate, + eTag, sitemap, crawlDelay); addEntry(entry); return entry; } @@ -129,7 +131,7 @@ public class plasmaCrawlRobotsTxt { public String addEntry(Entry entry) { // writes a new page and returns key try { - this.robotsTable.set(entry.hostName,entry.mem); + this.robotsTable.set(entry.hostName, entry.mem); return entry.hostName; } catch (IOException e) { return null; @@ -145,16 +147,16 @@ public class plasmaCrawlRobotsTxt { public static final String CRAWL_DELAY = "crawlDelay"; // this is a simple record structure that hold all properties of a single crawl start - Map mem; - private LinkedList disallowPathList; + Map mem; + private LinkedList disallowPathList; String hostName; - public Entry(String hostName, Map mem) { + public Entry(String hostName, Map mem) { this.hostName = hostName.toLowerCase(); this.mem = mem; if (this.mem.containsKey(DISALLOW_PATH_LIST)) { - this.disallowPathList = new LinkedList(); + this.disallowPathList = new LinkedList(); String csPl = (String) this.mem.get(DISALLOW_PATH_LIST); if (csPl.length() > 0){ String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); @@ -163,13 +165,13 @@ public class plasmaCrawlRobotsTxt { } } } else { - this.disallowPathList = new LinkedList(); + this.disallowPathList = new LinkedList(); } } public Entry( String hostName, - ArrayList disallowPathList, + ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag, @@ -179,9 +181,9 @@ public class plasmaCrawlRobotsTxt { if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing"); this.hostName = hostName.trim().toLowerCase(); - this.disallowPathList = new LinkedList(); + this.disallowPathList = new LinkedList(); - this.mem = new HashMap(5); + this.mem = new HashMap(5); if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime())); if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime())); if (eTag != null) this.mem.put(ETAG,eTag); @@ -259,9 +261,9 @@ public class plasmaCrawlRobotsTxt { else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B"); - Iterator pathIter = this.disallowPathList.iterator(); + Iterator pathIter = this.disallowPathList.iterator(); while (pathIter.hasNext()) { - String nextPath = (String) pathIter.next(); + String nextPath = pathIter.next(); // allow rule if (nextPath.startsWith("!") && nextPath.length() > 1 && path.startsWith(nextPath.substring(1))) { return false; diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 39cd9dfbc..8e6d63bbf 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -93,7 +93,7 @@ import de.anomic.yacy.yacyURL; public final class plasmaHTCache { - public static final String DB_NAME = "responseHeader1.db"; + public static final String DB_NAME = "responseHeader2.db"; private static final int stackLimit = 150; // if we exceed that limit, we do not check idle public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day @@ -307,7 +307,7 @@ public final class plasmaHTCache { private static void openResponseHeaderDB(long preloadTime) { // open the response header database File dbfile = new File(cachePath, DB_NAME); - responseHeaderDB = new kelondroMapObjects(new kelondroDyn(dbfile, true, true, preloadTime, yacySeedDB.commonHashLength, 150, '#', kelondroBase64Order.enhancedCoder, true, false, true), 500); + responseHeaderDB = new kelondroMapObjects(new kelondroDyn(dbfile, true, true, preloadTime, yacySeedDB.commonHashLength, 150, '#', kelondroBase64Order.enhancedCoder, false, false, true), 500); } private static void deleteOldHTCache(File directory) { diff --git a/source/de/anomic/plasma/plasmaSearchAPI.java b/source/de/anomic/plasma/plasmaSearchAPI.java new file mode 100644 index 000000000..ea04217d5 --- /dev/null +++ b/source/de/anomic/plasma/plasmaSearchAPI.java @@ -0,0 +1,206 @@ +// plasmaSearchAPI.java +// ----------------------- +// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 2008 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2007-11-14 01:15:28 +0000 (Mi, 14 Nov 2007) $ +// $LastChangedRevision: 4216 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +package de.anomic.plasma; + +import java.util.Date; +import java.util.Iterator; + +import de.anomic.data.listManager; +import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexURLEntry; +import de.anomic.kelondro.kelondroBitfield; +import de.anomic.plasma.urlPattern.plasmaURLPattern; +import de.anomic.server.serverDate; +import de.anomic.server.serverObjects; +import de.anomic.yacy.yacyCore; +import de.anomic.yacy.yacySeed; +import de.anomic.yacy.yacyURL; + +public class plasmaSearchAPI { + // collection of static methods for a search servlet. Exists only to prevent that the same processes are defined more than once. + + + public static kelondroBitfield compileFlags(serverObjects post) { + kelondroBitfield b = new kelondroBitfield(4); + if (post.get("allurl", "").equals("on")) return null; + if (post.get("flags") != null) { + if (post.get("flags","").length() == 0) return null; + return new kelondroBitfield(4, (String) post.get("flags")); + } + if (post.get("description", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_description, true); + if (post.get("title", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_title, true); + if (post.get("creator", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_creator, true); + if (post.get("subject", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_subject, true); + if (post.get("url", "").equals("on")) b.set(indexRWIEntry.flag_app_dc_identifier, true); + if (post.get("emphasized", "").equals("on")) b.set(indexRWIEntry.flag_app_emphasized, true); + if (post.get("image", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasimage, true); + if (post.get("audio", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasaudio, true); + if (post.get("video", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasvideo, true); + if (post.get("app", "").equals("on")) b.set(plasmaCondenser.flag_cat_hasapp, true); + if (post.get("indexof", "").equals("on")) b.set(plasmaCondenser.flag_cat_indexof, true); + return b; + } + + public static void listHosts(serverObjects prop, String startHash) { + // list known hosts + yacySeed seed; + int hc = 0; + prop.put("searchresult_keyhash", startHash); + if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) { + Iterator e = yacyCore.dhtAgent.getAcceptRemoteIndexSeeds(startHash); + while (e.hasNext()) { + seed = (yacySeed) e.next(); + if (seed != null) { + prop.put("searchresult_hosts_" + hc + "_hosthash", seed.hash); + prop.putHTML("searchresult_hosts_" + hc + "_hostname", seed.hash + " " + seed.get(yacySeed.NAME, "nameless")); + hc++; + } + } + prop.put("searchresult_hosts", hc); + } else { + prop.put("searchresult_hosts", "0"); + } + } + + public static plasmaSearchRankingProcess genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, int sortorder, boolean fetchURLs) { + plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, sb.getRanking(), filter); + plasmaSearchRankingProcess ranked = new plasmaSearchRankingProcess(sb.wordIndex, query, sortorder, Integer.MAX_VALUE); + ranked.execQuery(fetchURLs); + + if (ranked.filteredCount() == 0) { + prop.put("searchresult", 2); + prop.put("searchresult_wordhash", keyhash); + } else { + prop.put("searchresult", 3); + prop.put("searchresult_allurl", ranked.filteredCount()); + prop.put("searchresult_description", ranked.flagCount()[indexRWIEntry.flag_app_dc_description]); + prop.put("searchresult_title", ranked.flagCount()[indexRWIEntry.flag_app_dc_title]); + prop.put("searchresult_creator", ranked.flagCount()[indexRWIEntry.flag_app_dc_creator]); + prop.put("searchresult_subject", ranked.flagCount()[indexRWIEntry.flag_app_dc_subject]); + prop.put("searchresult_url", ranked.flagCount()[indexRWIEntry.flag_app_dc_identifier]); + prop.put("searchresult_emphasized", ranked.flagCount()[indexRWIEntry.flag_app_emphasized]); + prop.put("searchresult_image", ranked.flagCount()[plasmaCondenser.flag_cat_hasimage]); + prop.put("searchresult_audio", ranked.flagCount()[plasmaCondenser.flag_cat_hasaudio]); + prop.put("searchresult_video", ranked.flagCount()[plasmaCondenser.flag_cat_hasvideo]); + prop.put("searchresult_app", ranked.flagCount()[plasmaCondenser.flag_cat_hasapp]); + prop.put("searchresult_indexof", ranked.flagCount()[plasmaCondenser.flag_cat_indexof]); + } + return ranked; + } + + public static void genURLList(serverObjects prop, String keyhash, String keystring, plasmaSearchRankingProcess ranked, kelondroBitfield flags, int maxlines, int ordering) { + // search for a word hash and generate a list of url links + prop.put("genUrlList_keyHash", keyhash); + + if (ranked.filteredCount() == 0) { + prop.put("genUrlList", 1); + prop.put("genUrlList_count", 0); + prop.put("searchresult", 2); + } else { + prop.put("genUrlList", 2); + prop.put("searchresult", 3); + prop.put("genUrlList_flags", (flags == null) ? "" : flags.exportB64()); + prop.put("genUrlList_lines", maxlines); + prop.put("genUrlList_ordering", ordering); + int i = 0; + yacyURL url; + indexURLEntry entry; + String us; + long rn = -1; + while ((ranked.size() > 0) && ((entry = ranked.bestURL(false)) != null)) { + if ((entry == null) || (entry.comp() == null)) continue; + url = entry.comp().url(); + if (url == null) continue; + us = url.toNormalform(false, false); + if (rn == -1) rn = entry.ranking(); + prop.put("genUrlList_urlList_"+i+"_urlExists", "1"); + prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i); + prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().urlHash()); + prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring); + prop.put("genUrlList_urlList_"+i+"_urlExists_keyHash", keyhash); + prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlString", us); + prop.put("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "
" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "
" + us.substring(20)) : us)); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn)); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", yacyURL.domLengthEstimation(entry.hash())); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", plasmaSearchRankingProcess.ybr(entry.hash())); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", ranked.getOrder().authority(entry.hash())); + prop.put("genUrlList_urlList_"+i+"_urlExists_date", serverDate.formatShortDay(new Date(entry.word().lastModified()))); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle()); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext()); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_phrasesintext", entry.word().phrasesintext()); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_llocal", entry.word().llocal()); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_lother", entry.word().lother()); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_hitcount", entry.word().hitcount()); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_worddistance", entry.word().worddistance()); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_pos", entry.word().posintext()); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase()); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_posinphrase", entry.word().posinphrase()); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.word().urlcomps()); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_urllength", entry.word().urllength()); + prop.put("genUrlList_urlList_"+i+"_urlExists_props", + ((entry.word().flags().get(plasmaCondenser.flag_cat_indexof)) ? "appears on index page, " : "") + + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") + + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") + + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") + + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_dc_identifier)) ? "appears in url, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_dc_title)) ? "appears in title, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_dc_creator)) ? "appears in author, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_dc_subject)) ? "appears in subject, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_dc_description)) ? "appears in description, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized, " : "") + + ((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "") + ); + if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, url)) { + prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxChecked", "1"); + } + i++; + if ((maxlines >= 0) && (i >= maxlines)) break; + } + Iterator iter = ranked.miss(); // iterates url hash strings + while (iter.hasNext()) { + us = (String) iter.next(); + prop.put("genUrlList_urlList_"+i+"_urlExists", "0"); + prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i); + prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", us); + i++; + } + prop.put("genUrlList_urlList", i); + prop.putHTML("genUrlList_keyString", keystring); + prop.put("genUrlList_count", i); + putBlacklists(prop, listManager.getDirListing(listManager.listsPath)); + } + } + + public static void putBlacklists(serverObjects prop, String[] lists) { + prop.put("genUrlList_blacklists", lists.length); + for (int i=0; ipublic static final String DBFILE_CRAWL_ROBOTS = "crawlRobotsTxt.db"

*

Name of the file containing the database holding all robots.txt-entries of the lately crawled domains

* * @see plasmaSwitchboard#DBPATH for the folder this file lies in */ - public static final String DBFILE_CRAWL_ROBOTS = "crawlRobotsTxt.db"; + public static final String DBFILE_CRAWL_ROBOTS = "crawlRobotsTxt1.db"; /** *

public static final String DBFILE_USER = "DATA/SETTINGS/user.db"

*

Path to the user-DB, beginning from the YaCy-installation's top-folder. It holds all rights the created diff --git a/source/de/anomic/yacy/yacyCore.java b/source/de/anomic/yacy/yacyCore.java index 6f29c20ec..e763ea8dd 100644 --- a/source/de/anomic/yacy/yacyCore.java +++ b/source/de/anomic/yacy/yacyCore.java @@ -134,14 +134,13 @@ public class yacyCore { long memDHT_time = Long.parseLong(switchboard.getConfig("ramCacheDHT_time", "1000")); seedDB = new yacySeedDB( sb, - new File(yacyDBPath, "seed1.new.db"), - new File(yacyDBPath, "seed1.old.db"), - new File(yacyDBPath, "seed1.pot.db"), + new File(yacyDBPath, "seed2.new.db"), + new File(yacyDBPath, "seed2.old.db"), + new File(yacyDBPath, "seed2.pot.db"), memDHT_time); // create or init news database - long memNews_time = Long.parseLong(switchboard.getConfig("ramCacheNews_time", "1000")); - newsPool = new yacyNewsPool(yacyDBPath, memNews_time); + newsPool = new yacyNewsPool(yacyDBPath); loadSeedUploadMethods(); diff --git a/source/de/anomic/yacy/yacyNewsDB.java b/source/de/anomic/yacy/yacyNewsDB.java index 2fe9babb4..a36957292 100644 --- a/source/de/anomic/yacy/yacyNewsDB.java +++ b/source/de/anomic/yacy/yacyNewsDB.java @@ -50,30 +50,28 @@ import java.io.UnsupportedEncodingException; import java.util.Iterator; import de.anomic.kelondro.kelondroBase64Order; -import de.anomic.kelondro.kelondroCache; +import de.anomic.kelondro.kelondroEcoTable; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroIndex; import de.anomic.kelondro.kelondroRow; -import de.anomic.kelondro.kelondroTree; import de.anomic.server.serverCodings; import de.anomic.server.serverDate; public class yacyNewsDB { private File path; - private long preloadTime; protected kelondroIndex news; - public yacyNewsDB(File path, long preloadTime) { + public yacyNewsDB(File path) { this.path = path; - this.preloadTime = preloadTime; - this.news = new kelondroCache(kelondroTree.open(path, true, preloadTime, yacyNewsRecord.rowdef)); + this.news = new kelondroEcoTable(path, yacyNewsRecord.rowdef, kelondroEcoTable.tailCacheUsageAuto, 10, 0); + //this.news = new kelondroCache(kelondroTree.open(path, true, preloadTime, yacyNewsRecord.rowdef)); } private void resetDB() { try {close();} catch (Exception e) {} if (path.exists()) path.delete(); - this.news = new kelondroCache(kelondroTree.open(path, true, preloadTime, yacyNewsRecord.rowdef)); + this.news = new kelondroEcoTable(path, yacyNewsRecord.rowdef, kelondroEcoTable.tailCacheUsageAuto, 10, 0); } public void close() { diff --git a/source/de/anomic/yacy/yacyNewsPool.java b/source/de/anomic/yacy/yacyNewsPool.java index 98d6a8da6..e4f99dc77 100644 --- a/source/de/anomic/yacy/yacyNewsPool.java +++ b/source/de/anomic/yacy/yacyNewsPool.java @@ -265,8 +265,8 @@ public class yacyNewsPool { private int maxDistribution; - public yacyNewsPool(File yacyDBPath, long preloadTime) { - newsDB = new yacyNewsDB(new File(yacyDBPath, "news1.db"), preloadTime); + public yacyNewsPool(File yacyDBPath) { + newsDB = new yacyNewsDB(new File(yacyDBPath, "news2.db")); outgoingNews = new yacyNewsQueue(new File(yacyDBPath, "newsOut1.stack"), newsDB); publishedNews = new yacyNewsQueue(new File(yacyDBPath, "newsPublished1.stack"), newsDB); incomingNews = new yacyNewsQueue(new File(yacyDBPath, "newsIn1.stack"), newsDB); diff --git a/yacy.init b/yacy.init index 384b5b38e..47deece72 100644 --- a/yacy.init +++ b/yacy.init @@ -627,9 +627,6 @@ ramCacheWiki_time = 500 # ram cache for blog.db ramCacheBlog_time = 500 -# ram cache for news1.db -ramCacheNews_time = 1000 - # ram cache for robotsTxt.db ramCacheRobots_time = 0