diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 10f182b55..415cfc12f 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -62,6 +62,7 @@ import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntry; +import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyClient; @@ -255,12 +256,12 @@ public class IndexControl_p { } prop.put("urlstring", ""); prop.put("urlhash", ""); - plasmaWordIndexEntity[] indexes = new plasmaWordIndexEntity[1]; + plasmaWordIndexEntryContainer[] indexes = new plasmaWordIndexEntryContainer[1]; String result; long starttime = System.currentTimeMillis(); - indexes[0] = switchboard.wordIndex.getEntity(keyhash, true, -1); + indexes[0] = switchboard.wordIndex.getContainer(keyhash, true, -1); // built urlCache - Iterator urlIter = indexes[0].elements(true); + Iterator urlIter = indexes[0].entries(); HashMap knownURLs = new HashMap(); HashSet unknownURLEntries = new HashSet(); plasmaWordIndexEntry indexEntry; @@ -282,9 +283,7 @@ public class IndexControl_p { // now delete all entries that have no url entry Iterator hashIter = unknownURLEntries.iterator(); while (hashIter.hasNext()) { - try { - indexes[0].removeEntry((String) hashIter.next(), false); - } catch (IOException e) {} + indexes[0].remove((String) hashIter.next()); } // use whats remaining String gzipBody = switchboard.getConfig("indexControl.gzipBody","false"); @@ -296,7 +295,8 @@ public class IndexControl_p { "true".equalsIgnoreCase(gzipBody), timeout); prop.put("result", (result == null) ? 
("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result); - try {indexes[0].close();} catch (IOException e) {} + indexes[0] = null; + indexes = null; } // generate list @@ -431,15 +431,15 @@ public class IndexControl_p { public static String genUrlList(plasmaSwitchboard switchboard, String keyhash, String keystring) { // search for a word hash and generate a list of url links - plasmaWordIndexEntity index = null; + plasmaWordIndexEntryContainer index = null; try { - index = switchboard.wordIndex.getEntity(keyhash, true, -1); + index = switchboard.wordIndex.getContainer(keyhash, true, -1); final StringBuffer result = new StringBuffer(1024); if (index.size() == 0) { result.append("No URL entries related to this word hash ").append(keyhash).append("."); } else { - final Iterator en = index.elements(true); + final Iterator en = index.entries(); result.append("URL entries related to this word hash ").append(keyhash).append("

"); result.append("
"); String us; @@ -497,13 +497,12 @@ public class IndexControl_p { .append("for every resolveable and deleted URL reference, delete the same reference at every other word where the reference exists (very extensive, but prevents further unresolved references)") .append("

"); } - index.close(); index = null; return result.toString(); } catch (IOException e) { return ""; } finally { - if (index != null) try { index.close(); index = null; } catch (Exception e) {}; + if (index != null) index = null; } } diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index c50f10593..7bbee28dd 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -463,7 +463,7 @@ public class dir { "AAAAAAAAAAAA", /*referrer*/ 0, /*copycount*/ false, /*localneed*/ - condenser.RESULT_INFORMATION_VALUE, + condenser.RESULT_WORD_ENTROPHY, "**", /*language*/ plasmaWordIndexEntry.DT_SHARE, /*doctype*/ phrase.length(), /*size*/ diff --git a/htroot/index.java b/htroot/index.java index d10fb0df1..4aad3693c 100644 --- a/htroot/index.java +++ b/htroot/index.java @@ -126,7 +126,12 @@ public class index { // SEARCH // process search words - final String querystring = post.get("search", ""); + int maxDistance = Integer.MAX_VALUE; + String querystring = post.get("search", "").trim(); + if ((querystring.length() > 1) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) { + querystring = querystring.substring(1, querystring.length() - 1).trim(); + maxDistance = 1; + } if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {} final TreeSet query = plasmaSearchQuery.cleanQuery(querystring); // filter out stopwords @@ -172,7 +177,7 @@ public class index { } // do the search - plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer, + plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, maxDistance, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer, ((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ?
plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL, "", 20); final serverObjects prop = sb.searchFromLocal(thisSearch); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 93c2940cf..cd15fa1ef 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -47,7 +47,6 @@ // javac -classpath .:../../Classes search.java // if the shell's current path is htroot/yacy -import java.io.IOException; import java.util.HashSet; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaCrawlLURL; @@ -81,6 +80,7 @@ public final class search { // final String fwden = post.get("fwden", ""); // forward deny, a list of seed hashes. They may NOT be target of forward hopping final long duetime= post.getLong("duetime", 3000); final int count = post.getInt("count", 10); // maximum number of wanted results + final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); // final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers // Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time @@ -103,8 +103,8 @@ public final class search { } final long timestamp = System.currentTimeMillis(); - plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY}, - count, duetime, ".*"); + plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY}, + count, duetime, ".*"); squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; serverObjects prop = new serverObjects(); @@ -114,11 +114,8 @@ public final class search { plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache); plasmaSearchResult acc = null; int idxc = 0; - try { - idxc = 
theSearch.localSearch(); - acc = theSearch.order(); - } catch (IOException e) { - } + idxc = theSearch.localSearch(); + acc = theSearch.order(); // result is a List of urlEntry elements if ((idxc == 0) || (acc == null)) { diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 38d3eaa67..dde0f89a7 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -75,6 +75,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen linkTags0.add("img"); linkTags0.add("base"); linkTags0.add("frame"); + linkTags0.add("meta"); linkTags1 = new TreeSet(insensitiveCollator); linkTags1.add("a"); @@ -88,6 +89,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen // class variables: collectors for links private HashMap anchors; private HashMap images; + private HashMap metas; private String title; //private String headline; private List[] headlines; @@ -101,6 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen this.root = root; this.anchors = new HashMap(); this.images = new HashMap(); + this.metas = new HashMap(); this.title = ""; this.headlines = new ArrayList[4]; for (int i = 0; i < 4; i++) headlines[i] = new ArrayList(); @@ -193,7 +196,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen return null; } } - + + public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"'; + public static String[] urlComps(String normalizedURL) { + return normalizedURL.toLowerCase().split(splitrex); // word components of the url + } + private String absolutePath(String relativePath) { try { return urlNormalform(new URL(root, relativePath)); @@ -206,6 +214,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (tagname.equalsIgnoreCase("img")) 
images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt","")); if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {} if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name","")); + if (tagname.equalsIgnoreCase("meta")) metas.put((tagopts.getProperty("name", "")).toLowerCase(), tagopts.getProperty("content","")); } public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { @@ -252,10 +261,16 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen // construct a title string, even if the document has no title // if there is one, return it if (title.length() > 0) return title; + // othervise take any headline for (int i = 0; i < 4; i++) { if (headlines[i].size() > 0) return (String) headlines[i].get(0); } + + // take description tag + String s = getDescription(); + if (s.length() > 0) return s; + // extract headline from content if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80)); return cleanLine(content.trim().toString()); @@ -280,6 +295,45 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen return images; } + public Map getMetas() { + return metas; + } + + public String getDescription() { + String s = (String) metas.get("description"); + if (s == null) return ""; else return s; + } + + public String getContentType() { + String s = (String) metas.get("content-type"); + if (s == null) return ""; else return s; + } + + public String getCopyright() { + String s = (String) metas.get("copyright"); + if (s == null) return ""; else return s; + } + + public String[] getContentLanguages() { + String s = (String) metas.get("content-language"); + if (s == null) s = ""; + return s.split(" |,"); + } + + public String[] getKeywords() { + String s = (String) metas.get("keywords"); + if (s == null) s = ""; + if 
(s.length() == 0) { + return getTitle().toLowerCase().split(splitrex); + } else { + return s.split(" |,"); + } + } + + /* + * (non-Javadoc) + * @see de.anomic.htmlFilter.htmlFilterScraper#close() + */ public void close() { // free resources super.close(); @@ -298,6 +352,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } System.out.println("ANCHORS :" + anchors.toString()); System.out.println("IMAGES :" + images.toString()); + System.out.println("METAS :" + metas.toString()); System.out.println("TEXT :" + new String(content.getBytes())); } diff --git a/source/de/anomic/kelondro/kelondroTree.java b/source/de/anomic/kelondro/kelondroTree.java index a34e23c59..96e5a25bb 100644 --- a/source/de/anomic/kelondro/kelondroTree.java +++ b/source/de/anomic/kelondro/kelondroTree.java @@ -850,11 +850,11 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex { this.rot = rotating; ii = new nodeIterator(asc, rot, start); nextNode = (ii.hasNext()) ? (Node) ii.next() : null; - if (nextNode != null) { + if ((nextNode != null) && (nextNode.getKey() != null)) { int c = objectOrder.compare(firstKey, nextNode.getKey()); if ((c > 0) && (asc)) { // firstKey > nextNode.getKey() - log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey())); + if (log != null) log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey())); nextNode = (ii.hasNext()) ? 
(Node) ii.next() : null; } if ((c < 0) && (!(asc))) { diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index b912a4101..830fbb2a2 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -83,9 +83,6 @@ public final class plasmaCondenser { public int RESULT_NUMB_SENTENCES = -1; public int RESULT_DIFF_SENTENCES = -1; public int RESULT_SIMI_SENTENCES = -1; - public int RESULT_AVERAGE_WORD_OCC = -1; - public int RESULT_INFORMATION_VALUE = -1; - public plasmaCondenser(InputStream text) { this(text, 3, 2); @@ -357,8 +354,7 @@ public final class plasmaCondenser { this.RESULT_NUMB_SENTENCES = allsentencecounter; this.RESULT_DIFF_SENTENCES = sentenceHandleCount; this.RESULT_SIMI_SENTENCES = sentences.size(); - this.RESULT_AVERAGE_WORD_OCC = (words.size() == 0) ? 0 : (allwordcounter / words.size()); - this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16); + //this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 
0 : (wordenum.count() * words.size() / allwordcounter / 16); } public void print() { diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 9bf2408a1..6b44693a1 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -176,7 +176,7 @@ public final class plasmaCrawlLURL extends plasmaURL { gcrawlResultStack.add(urlHash + initiatorHash + executorHash); } - public synchronized Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException { + public Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException { return new Entry(hash, searchedWord); } @@ -399,8 +399,16 @@ public final class plasmaCrawlLURL extends plasmaURL { private int size; private int wordCount; private String snippet; - private plasmaWordIndexEntry word; - + private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests + + // more needed attributes: + // - author / copyright owner + // - keywords + // - phrasecount, total number of phrases + // - boolean: URL attributes + // - int: # of outlinks to same domain + // - int: # of outlinks to outside domain + public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) { // create new entry and store it into database this.urlHash = urlHash(url); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 249213774..baf32b78e 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -63,7 +63,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { private plasmaWordIndex wordIndex; private plasmaCrawlLURL urlStore; private plasmaSnippetCache snippetCache; - private plasmaWordIndexEntity rcLocal, 
rcGlobal; // caches for results + private plasmaWordIndexEntryContainer rcLocal, rcGlobal; // caches for results private plasmaSearchProfile profileLocal, profileGlobal; private yacySearch[] searchThreads; @@ -73,8 +73,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable { this.query = query; this.urlStore = urlStore; this.snippetCache = snippetCache; - this.rcLocal = new plasmaWordIndexEntity(null); - this.rcGlobal = new plasmaWordIndexEntity(null); + this.rcLocal = new plasmaWordIndexEntryContainer(null); + this.rcGlobal = new plasmaWordIndexEntryContainer(null); if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) { this.profileLocal = new plasmaSearchProfile(4 * query.maximumTime / 10, query.wantedResults); this.profileGlobal = new plasmaSearchProfile(6 * query.maximumTime / 10, query.wantedResults); @@ -114,68 +114,56 @@ public final class plasmaSearchEvent extends Thread implements Runnable { int globalContributions = globalSearch(fetchpeers); log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); - try { - // combine the result and order - plasmaSearchResult result = order(); - result.globalContributions = globalContributions; - result.localContributions = rcLocal.size(); - - // flush results in a separate thread - this.start(); // start to flush results - //serverInstantThread.oneTimeJob(this, "flushResults", log, 0); - - // clean up - if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close(); - rcLocal = null; - - // return search result - log.logFine("SEARCHRESULT: " + profileLocal.reportToString()); - lastEvent = this; - return result; - } catch (IOException e) { - return null; - } + // combine the result and order + plasmaSearchResult result = order(); + result.globalContributions = globalContributions; + result.localContributions = rcLocal.size(); + + // flush results in a separate thread + this.start(); // start to flush 
results + //serverInstantThread.oneTimeJob(this, "flushResults", log, 0); + + // clean up + rcLocal = null; + + // return search result + log.logFine("SEARCHRESULT: " + profileLocal.reportToString()); + lastEvent = this; + return result; } else { - // do a local search - //long start = System.currentTimeMillis(); - try { - localSearch(); - plasmaSearchResult result = order(); - result.localContributions = rcLocal.size(); - - // clean up - if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close(); - rcLocal = null; - - // return search result - log.logFine("SEARCHRESULT: " + profileLocal.reportToString()); - lastEvent = this; - return result; - } catch (IOException e) { - return null; - } + localSearch(); + plasmaSearchResult result = order(); + result.localContributions = rcLocal.size(); + + // clean up + rcLocal = null; + + // return search result + log.logFine("SEARCHRESULT: " + profileLocal.reportToString()); + lastEvent = this; + return result; } } - public int localSearch() throws IOException { + public int localSearch() { // search for the set of hashes and return an array of urlEntry elements // retrieve entities that belong to the hashes profileLocal.startTimer(); - Set entities = wordIndex.getEntities(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION)); - if (entities.size() < query.size()) entities = null; // prevent that only a subset is returned + Set containers = wordIndex.getContainers(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION)); + if (containers.size() < query.size()) containers = null; // prevent that only a subset is returned profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_COLLECTION); - profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (entities == null) ? 0 : entities.size()); + profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (containers == null) ? 
0 : containers.size()); // since this is a conjunction we return an empty entity if any word is not known - if (entities == null) { - rcLocal = new plasmaWordIndexEntity(null); + if (containers == null) { + rcLocal = new plasmaWordIndexEntryContainer(null); return 0; } // join the result profileLocal.startTimer(); - rcLocal = plasmaWordIndexEntity.joinEntities(entities, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN)); + rcLocal = plasmaWordIndexEntryContainer.joinContainer(containers, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN), query.maxDistance); profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_JOIN); profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_JOIN, rcLocal.size()); @@ -190,7 +178,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS"); long timeout = System.currentTimeMillis() + profileGlobal.duetime() + 4000; - searchThreads = yacySearch.searchHashes(query.queryHashes, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal); + searchThreads = yacySearch.searchHashes(query.queryHashes, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal); // wait until wanted delay passed or wanted result appeared while (System.currentTimeMillis() < timeout) { @@ -204,20 +192,20 @@ public final class plasmaSearchEvent extends Thread implements Runnable { return rcGlobal.size(); } - public plasmaSearchResult order() throws IOException { + public plasmaSearchResult order() { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime - plasmaWordIndexEntity searchResult = new plasmaWordIndexEntity(null); - 
searchResult.merge(rcLocal, -1); - searchResult.merge(rcGlobal, -1); + plasmaWordIndexEntryContainer searchResult = new plasmaWordIndexEntryContainer(null); + searchResult.add(rcLocal); + searchResult.add(rcGlobal); long preorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_PRESORT); long postorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_POSTSORT); profileLocal.startTimer(); plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query); - preorder.addEntity(searchResult, preorderTime); + preorder.addContainer(searchResult, preorderTime); profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_PRESORT); profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_PRESORT, rcLocal.size()); @@ -289,19 +277,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable { Iterator hashi = query.queryHashes.iterator(); while (hashi.hasNext()) { wordHash = (String) hashi.next(); - Iterator i = rcGlobal.elements(true); - plasmaWordIndexEntry entry; - plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash, rcGlobal.size()); - while (i.hasNext()) { - entry = (plasmaWordIndexEntry) i.next(); - container.add(entry, System.currentTimeMillis()); - } - wordIndex.addEntries(container, true); - log.logFine("FLUSHED " + wordHash + ": " + container.size() + " url entries"); + rcGlobal.setWordHash(wordHash); + wordIndex.addEntries(rcGlobal, true); + log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries"); } // the rcGlobal was flushed, empty it count += rcGlobal.size(); - rcGlobal.deleteComplete(); + rcGlobal.clear(); } // wait a little bit before trying again try {Thread.sleep(3000);} catch (InterruptedException e) {} diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index b8fac651a..90ebb0af6 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -116,8 
+116,8 @@ public final class plasmaSearchPreOrder { return (plasmaWordIndexEntry) pageAcc.remove(top); } - public void addEntity(plasmaWordIndexEntity entity, long maxTime) { - Iterator i = entity.elements(true); + public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) { + Iterator i = container.entries(); long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; plasmaWordIndexEntry entry; while (i.hasNext()) { diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 57a33dee0..b19eb2d23 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -72,12 +72,14 @@ public final class plasmaSearchQuery { public int domType; public String domGroupName; public int domMaxTargets; + public int maxDistance; - public plasmaSearchQuery(Set queryWords, + public plasmaSearchQuery(Set queryWords, int maxDistance, String[] order, int wantedResults, long maximumTime, String urlMask, String referrer, int domType, String domGroupName, int domMaxTargets) { this.queryWords = queryWords; + this.maxDistance = maxDistance; this.queryHashes = words2hashes(queryWords); this.order = order; this.wantedResults = wantedResults; @@ -89,9 +91,10 @@ public final class plasmaSearchQuery { this.domMaxTargets = domMaxTargets; } - public plasmaSearchQuery(Set queryHashes, + public plasmaSearchQuery(Set queryHashes, int maxDistance, String[] order, int wantedResults, long maximumTime, String urlMask) { this.queryWords = null; + this.maxDistance = maxDistance; this.queryHashes = queryHashes; this.order = order; this.wantedResults = wantedResults; diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index 164805f6d..e56fcb530 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -54,11 +54,10 @@ import 
java.net.MalformedURLException; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.server.serverCodings; +import de.anomic.htmlFilter.htmlFilterContentScraper; public final class plasmaSearchResult { - public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"'; - private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects @@ -111,8 +110,8 @@ public final class plasmaSearchResult { URL url = page.url(); String descr = page.descr(); if ((url == null) || (descr == null)) return; - String[] urlcomps = url.toString().toLowerCase().split(splitrex); // word components of the url - String[] descrcomps = descr.toLowerCase().split(splitrex); // words in the description + String[] urlcomps = htmlFilterContentScraper.urlComps(url.toString()); // word components of the url + String[] descrcomps = descr.toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description // store everything Object[] resultVector = new Object[] {indexEntry, page, urlcomps, descrcomps}; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 31a04422b..f4b57214f 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1285,7 +1285,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser yacyCore.seedDB.mySeed.hash, referrerHash, 0, true, - condenser.RESULT_INFORMATION_VALUE, + condenser.RESULT_WORD_ENTROPHY, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()), (int) entry.size(), @@ -1313,15 +1313,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } else { HashMap urlCache = new HashMap(1); 
urlCache.put(newEntry.hash(),newEntry); - ArrayList tmpEntities = new ArrayList(condenser.RESULT_SIMI_WORDS); + ArrayList tmpContainers = new ArrayList(condenser.RESULT_SIMI_WORDS); String language = plasmaWordIndexEntry.language(entry.url()); char doctype = plasmaWordIndexEntry.docType(document.getMimeType()); - int quality = 0; - try { - quality = condenser.RESULT_INFORMATION_VALUE; - } catch (NumberFormatException e) { - System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + newEntry.url().toString()); - } + int urlLength = newEntry.url().toString().length(); + int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length; // iterate over all words Iterator i = condenser.words(); @@ -1332,8 +1328,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String word = (String) wentry.getKey(); wordStat = (plasmaCondenser.wordStatProp) wentry.getValue(); String wordHash = plasmaWordIndexEntry.word2hash(word); - plasmaWordIndexEntity wordIdxEntity = new plasmaWordIndexEntity(wordHash); + plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash); plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash, + urlLength, urlComps, wordStat.count, condenser.RESULT_SIMI_WORDS, condenser.RESULT_SIMI_SENTENCES, @@ -1344,26 +1341,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser newEntry.size(), docDate.getTime(), System.currentTimeMillis(), - quality, language, doctype, true); - wordIdxEntity.addEntry(wordIdxEntry); - tmpEntities.add(wordIdxEntity); + condenser.RESULT_WORD_ENTROPHY, + language, + doctype, + true); + wordIdxContainer.add(wordIdxEntry); + tmpContainers.add(wordIdxContainer); // wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry)); } //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + 
condenser.getWords().size() + " words, flushed " + c + " entries"); words = condenser.RESULT_SIMI_WORDS; // transfering the index to the storage peer - String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000); + String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]),urlCache,true,120000); if (error != null) { words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); } - // cleanup - for (int j=0; j < tmpEntities.size(); j++) { - plasmaWordIndexEntity tmpEntity = (plasmaWordIndexEntity) tmpEntities.get(j); - try { tmpEntity.close(); } catch (Exception e) {} - } + tmpContainers = null; } storageEndTime = System.currentTimeMillis(); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 2e6961953..e3814224d 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -56,6 +56,7 @@ import java.util.Set; import java.util.Date; import java.net.URL; +import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.server.logging.serverLog; @@ -136,16 +137,8 @@ public final class plasmaWordIndex { public int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaCondenser condenser, String language, char doctype) { // this is called by the switchboard to put in a new page into the index - // use all the words in one condenser object to simultanous create index - // entries - // int age = microDateDays(urlModified); - int quality = 0; - try { - quality = condenser.RESULT_INFORMATION_VALUE; - } catch (NumberFormatException e) { - System.out.println("INTERNAL ERROR WITH
CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString()); - } - + // use all the words in one condenser object to simultanous create index entries + // iterate over all words Iterator i = condenser.words(); Map.Entry wentry; @@ -153,6 +146,9 @@ public final class plasmaWordIndex { plasmaWordIndexEntry ientry; plasmaCondenser.wordStatProp wprop; String wordHash; + int urlLength = url.toString().length(); + int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length; + while (i.hasNext()) { wentry = (Map.Entry) i.next(); word = (String) wentry.getKey(); @@ -160,6 +156,7 @@ public final class plasmaWordIndex { // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); wordHash = plasmaWordIndexEntry.word2hash(word); ientry = new plasmaWordIndexEntry(urlHash, + urlLength, urlComps, wprop.count, condenser.RESULT_SIMI_WORDS, condenser.RESULT_SIMI_SENTENCES, @@ -170,18 +167,54 @@ public final class plasmaWordIndex { size, urlModified.getTime(), System.currentTimeMillis(), - quality, language, doctype, true); + condenser.RESULT_WORD_ENTROPHY, + language, + doctype, + true); addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false); } // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + // condenser.getWords().size() + " words, flushed " + c + " entries"); return condenser.RESULT_SIMI_WORDS; } + + public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { + return ramCache.getContainer(wordHash, deleteIfEmpty, maxTime); + } public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) { - return ramCache.getIndex(wordHash, deleteIfEmpty, maxTime); + return ramCache.getEntity(wordHash, deleteIfEmpty, maxTime); } + public Set getContainers(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) { + + // retrieve entities that belong to the hashes + HashSet 
containers = new HashSet(); + String singleHash; + plasmaWordIndexEntryContainer singleContainer; + Iterator i = wordHashes.iterator(); + long start = System.currentTimeMillis(); + long remaining; + while (i.hasNext()) { + // check time + remaining = maxTime - (System.currentTimeMillis() - start); + //if ((maxTime > 0) && (remaining <= 0)) break; + + // get next hash: + singleHash = (String) i.next(); + + // retrieve index + singleContainer = getContainer(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size())); + + // check result + if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashSet(); + + containers.add(singleContainer); + } + return containers; + } + + /* public Set getEntities(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) { // retrieve entities that belong to the hashes @@ -203,13 +236,14 @@ public final class plasmaWordIndex { singleEntity = getEntity(singleHash, deleteIfEmpty, (maxTime < 0) ? 
-1 : remaining / (wordHashes.size() - entities.size())); // check result - if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return null; + if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return new HashSet(); entities.add(singleEntity); } return entities; } - + */ + public int size() { return ramCache.size(); } diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java index 8572e7b40..e1b054255 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java @@ -203,7 +203,7 @@ public final class plasmaWordIndexAssortmentCluster { } public plasmaWordIndexEntryContainer removeFromAll(String wordHash, long maxTime) { - // collect all records from all the assortments and return them + // removes all records from all the assortments and return them plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash); long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime; for (int i = 0; i < clusterCount; i++) { @@ -214,6 +214,18 @@ public final class plasmaWordIndexAssortmentCluster { return record; } + public plasmaWordIndexEntryContainer getFromAll(String wordHash, long maxTime) { + // collect all records from all the assortments and return them + plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash); + long limitTime = (maxTime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + maxTime; + for (int i = 0; i < clusterCount; i++) { + buffer = assortments[i].get(wordHash); + if (buffer != null) record.add(buffer); + if (System.currentTimeMillis() > limitTime) break; + } + return record; + } + public Iterator hashConjunction(String startWordHash, boolean up, boolean rot) { HashSet iterators = new HashSet(); //if (rot) System.out.println("WARNING: kelondroMergeIterator does not work correctly when individual iterators rotate on their own!"); diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index eaf57a520..4e506fe94 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -391,7 +391,18 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { } } - public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) { + public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { + long start = System.currentTimeMillis(); + if (maxTime > 0) maxTime = 8 * maxTime / 10; // reserve time for later adding to backend + plasmaWordIndexEntryContainer container = assortmentCluster.getFromAll(wordHash, maxTime); + if (container == null) { + container = new plasmaWordIndexEntryContainer(wordHash); + } + container.add(backend.getContainer(wordHash, deleteIfEmpty, (maxTime < 0) ? 
-1 : System.currentTimeMillis() - start)); + return container; + } + + public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) { // this possibly creates an index file in the back-end // the index file is opened and returned as entity object long start = System.currentTimeMillis(); @@ -406,7 +417,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { } } long r = maxTime - (System.currentTimeMillis() - start); - return backend.getIndex(wordHash, deleteIfEmpty, (r < 0) ? 0 : r); + return backend.getEntity(wordHash, deleteIfEmpty, (r < 0) ? 0 : r); } public long getUpdateTime(String wordHash) { diff --git a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java index 5714f2038..42088db56 100644 --- a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java +++ b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java @@ -181,7 +181,24 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface { } } - public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) { + public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { + long start = System.currentTimeMillis(); + if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) { + plasmaWordIndexEntity entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? 
-1 : maxTime * 9 / 10); + plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); + plasmaWordIndexEntry entry; + Iterator i = entity.elements(true); + while ((i.hasNext()) && ((maxTime < 0) || (System.currentTimeMillis() < start + maxTime))) { + entry = (plasmaWordIndexEntry) i.next(); + container.add(entry); + } + return container; + } else { + return new plasmaWordIndexEntryContainer(wordHash, 0); + } + } + + public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) { return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty); } @@ -190,7 +207,6 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface { if (f.exists()) return f.lastModified(); else return -1; } - public void deleteIndex(String wordHash) { plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash); } @@ -200,7 +216,7 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface { plasmaWordIndexEntity pi = null; int count = 0; try { - pi = getIndex(wordHash, true, -1); + pi = getEntity(wordHash, true, -1); for (int i = 0; i < urlHashes.length; i++) if (pi.removeEntry(urlHashes[i], deleteComplete)) count++; int size = pi.size(); diff --git a/source/de/anomic/plasma/plasmaWordIndexDistribution.java b/source/de/anomic/plasma/plasmaWordIndexDistribution.java index 4ed88dbdf..78e0dcceb 100644 --- a/source/de/anomic/plasma/plasmaWordIndexDistribution.java +++ b/source/de/anomic/plasma/plasmaWordIndexDistribution.java @@ -201,33 +201,33 @@ public final class plasmaWordIndexDistribution { // collect index String startPointHash = selectTransferStart(); log.logFine("Selected hash " + startPointHash + " as start point for index distribution, distance = " + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash)); - Object[] selectResult = selectTransferIndexes(startPointHash, indexCount, this.maxOpenFiles4Distribution); - plasmaWordIndexEntity[] indexEntities = 
(plasmaWordIndexEntity[]) selectResult[0]; + Object[] selectResult = selectTransferContainers(startPointHash, indexCount, this.maxOpenFiles4Distribution); + plasmaWordIndexEntryContainer[] indexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0]; //Integer openedFiles = (Integer) selectResult[2]; HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry - if ((indexEntities == null) || (indexEntities.length == 0)) { + if ((indexContainers == null) || (indexContainers.length == 0)) { log.logFine("No index available for index transfer, hash start-point " + startPointHash); return -1; } // count the indexes again, can be smaller as expected indexCount = 0; - for (int i = 0; i < indexEntities.length; i++) { - indexCount += indexEntities[i].size(); + for (int i = 0; i < indexContainers.length; i++) { + indexCount += indexContainers[i].size(); } if (indexCount < 50) { log.logFine("Too few (" + indexCount + ") indexes selected for transfer."); - closeTransferIndexes (indexEntities); + closeTransferIndexes(indexContainers); return -1; // failed } // find start point for DHT-selection - String keyhash = indexEntities[indexEntities.length - 1].wordHash(); // DHT targets must have greater hashes + String keyhash = indexContainers[indexContainers.length - 1].wordHash(); // DHT targets must have greater hashes // find a list of DHT-peers yacySeed[] seeds = new yacySeed[peerCount + 10]; int hc0 = 0; - double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[0].wordHash()), - yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[indexEntities.length - 1].wordHash())); + double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[0].wordHash()), + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[indexContainers.length - 1].wordHash())); double maxDistance = Math.min(ownDistance, 0.4); synchronized (yacyCore.dhtAgent) { 
double avdist; @@ -239,8 +239,8 @@ public final class plasmaWordIndexDistribution { } seeds[hc0] = (yacySeed) e.nextElement(); if (seeds[hc0] != null) { - avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[0].wordHash()), - yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[indexEntities.length - 1].wordHash())); + avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[0].wordHash()), + yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[indexContainers.length - 1].wordHash())); if (avdist < maxDistance) { log.logInfo("Selected " + ((hc0 < peerCount) ? "primary" : "reserve") + " DHT target peer " + seeds[hc0].getName() + ":" + seeds[hc0].hash + ", distance = " + avdist); hc0++; @@ -252,7 +252,7 @@ public final class plasmaWordIndexDistribution { if (hc0 < peerCount) { log.logWarning("found not enough (" + hc0 + ") peers for distribution"); - closeTransferIndexes (indexEntities); + closeTransferIndexes(indexContainers); return -1; // failed } @@ -267,9 +267,9 @@ public final class plasmaWordIndexDistribution { return -1; // interrupted } start = System.currentTimeMillis(); - error = yacyClient.transferIndex(seeds[i], indexEntities, urlCache, this.gzipBody4Distribution, this.timeout4Distribution); + error = yacyClient.transferIndex(seeds[i], indexContainers, urlCache, this.gzipBody4Distribution, this.timeout4Distribution); if (error == null) { - log.logInfo("Index transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000) + log.logInfo("Index transfer of " + indexCount + " words [" + indexContainers[0].wordHash() + " .. 
" + indexContainers[indexContainers.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000) + " seconds successfull (" + (1000 * indexCount / (System.currentTimeMillis() - start + 1)) + " words/s)"); peerNames += ", " + seeds[i].getName(); hc1++; @@ -286,8 +286,8 @@ public final class plasmaWordIndexDistribution { // success if (delete) { try { - if (deleteTransferIndexes(indexEntities)) { - log.logFine("Deleted all " + indexEntities.length + " transferred whole-word indexes locally"); + if (deleteTransferIndexes(indexContainers)) { + log.logFine("Deleted all " + indexContainers.length + " transferred whole-word indexes locally"); return indexCount; } else { log.logSevere("Deleted not all transferred whole-word indexes"); @@ -299,13 +299,13 @@ public final class plasmaWordIndexDistribution { } } else { // simply close the indexEntities - closeTransferIndexes (indexEntities); + closeTransferIndexes(indexContainers); } return indexCount; } else { log.logSevere("Index distribution failed. 
Too few peers (" + hc1 + ") received the index, not deleted locally."); // simply close the indexEntities - closeTransferIndexes (indexEntities); + closeTransferIndexes(indexContainers); return -1; } } @@ -322,15 +322,16 @@ public final class plasmaWordIndexDistribution { return startPointHash; } - Object[] /* of {plasmaWordIndexEntity[], HashMap(String, plasmaCrawlLURL.Entry)}*/ - selectTransferIndexes(String hash, int count, int maxOpenFiles) { + Object[] /* of {plasmaWordIndexEntryContainer[], HashMap(String, plasmaCrawlLURL.Entry)}*/ + selectTransferContainers(String hash, int count, int maxOpenFiles) { // the hash is a start hash from where the indexes are picked - ArrayList tmpEntities = new ArrayList(count); + ArrayList tmpContainers = new ArrayList(count); String nexthash = ""; try { int currOpenFiles = 0; Iterator wordHashIterator = this.wordIndex.wordHashes(hash, true, true); - plasmaWordIndexEntity indexEntity, tmpEntity; + plasmaWordIndexEntity indexEntity; + plasmaWordIndexEntryContainer indexContainer; Iterator urlIter; Iterator hashIter; plasmaWordIndexEntry indexEntry; @@ -343,56 +344,15 @@ public final class plasmaWordIndexDistribution { (wordHashIterator.hasNext()) && ((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0) && - ((currOpenFiles == 0) || (yacyDHTAction.dhtDistance(nexthash, - ((plasmaWordIndexEntity)tmpEntities.get(0)).wordHash()) < 0.2)) + ((currOpenFiles == 0) || + (yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntity)tmpContainers.get(0)).wordHash()) < 0.2)) ) { indexEntity = this.wordIndex.getEntity(nexthash, true, -1); if (indexEntity.size() == 0) { indexEntity.deleteComplete(); - } else if ((indexEntity.size() <= count)|| // if we havn't exceeded the limit - (Math.abs(indexEntity.size() - count) <= 10)){ // or there are only at most 10 entries left - // take the whole entity - try { - // fist check if we know all urls - urlIter = indexEntity.elements(true); - unknownURLEntries.clear(); 
- while (urlIter.hasNext()) { - indexEntry = (plasmaWordIndexEntry) urlIter.next(); - try { - lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry); - if ((lurl == null) || (lurl.url() == null)) { - unknownURLEntries.add(indexEntry.getUrlHash()); - } else { - knownURLs.put(indexEntry.getUrlHash(), lurl); - } - } catch (IOException e) { - unknownURLEntries.add(indexEntry.getUrlHash()); - } - } - // now delete all entries that have no url entry - hashIter = unknownURLEntries.iterator(); - while (hashIter.hasNext()) { - String nextUrlHash = (String) hashIter.next(); - indexEntity.removeEntry(nextUrlHash, false); - this.urlPool.loadedURL.remove(nextUrlHash); - } - - if (indexEntity.size() == 0) { - indexEntity.deleteComplete(); - } else { - // use whats remaining - tmpEntities.add(indexEntity); - this.log.logFine("Selected whole index (" + indexEntity.size() + " URLs, " + unknownURLEntries.size() + " not bound) for word " + indexEntity.wordHash()); - count -= indexEntity.size(); - currOpenFiles++; - } - } catch (kelondroException e) { - this.log.logSevere("plasmaWordIndexDistribution/1: deleted DB for word " + indexEntity.wordHash(), e); - indexEntity.deleteComplete(); - } } else { // make an on-the-fly entity and insert values - tmpEntity = new plasmaWordIndexEntity(indexEntity.wordHash()); + indexContainer = new plasmaWordIndexEntryContainer(indexEntity.wordHash()); try { urlIter = indexEntity.elements(true); unknownURLEntries.clear(); @@ -404,7 +364,7 @@ public final class plasmaWordIndexDistribution { unknownURLEntries.add(indexEntry.getUrlHash()); } else { knownURLs.put(indexEntry.getUrlHash(), lurl); - tmpEntity.addEntry(indexEntry); + indexContainer.add(indexEntry); count--; } } catch (IOException e) { @@ -426,8 +386,8 @@ public final class plasmaWordIndexDistribution { } // use whats remaining - this.log.logFine("Selected partial index (" + tmpEntity.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not 
bound) for word " + tmpEntity.wordHash()); - tmpEntities.add(tmpEntity); + this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + indexContainer.wordHash()); + tmpContainers.add(indexContainer); } catch (kelondroException e) { this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + indexEntity.wordHash(), e); indexEntity.deleteComplete(); @@ -438,8 +398,8 @@ public final class plasmaWordIndexDistribution { } // transfer to array - plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]); - return new Object[]{indexEntities, knownURLs, new Integer(currOpenFiles)}; + plasmaWordIndexEntryContainer[] entryContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]); + return new Object[]{entryContainers, knownURLs, new Integer(currOpenFiles)}; } catch (IOException e) { this.log.logSevere("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage(), e); return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)}; @@ -477,6 +437,40 @@ public final class plasmaWordIndexDistribution { } catch (IOException ee) {} } + void closeTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) { + for (int i = 0; i < indexContainers.length; i++) { + indexContainers[i] = null; + } + } + + boolean deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) throws IOException { + Iterator urlIter; + plasmaWordIndexEntry indexEntry; + plasmaWordIndexEntity indexEntity; + String[] urlHashes; + int sz; + boolean success = true; + for (int i = 0; i < indexContainers.length; i++) { + // delete entries separately + int c = 0; + urlHashes = new String[indexContainers[i].size()]; + urlIter = indexContainers[i].entries(); + while (urlIter.hasNext()) { + indexEntry = (plasmaWordIndexEntry) 
urlIter.next(); + urlHashes[c++] = indexEntry.getUrlHash(); + } + wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true); + indexEntity = wordIndex.getEntity(indexContainers[i].wordHash(), true, -1); + sz = indexEntity.size(); + // indexEntity.close(); + closeTransferIndex(indexEntity); + log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + sz + " entries left"); + indexContainers[i] = null; + } + return success; + } + +/* boolean deleteTransferIndexes(plasmaWordIndexEntity[] indexEntities) throws IOException { Iterator urlIter; plasmaWordIndexEntry indexEntry; @@ -500,13 +494,6 @@ public final class plasmaWordIndexDistribution { // indexEntity.close(); closeTransferIndex(indexEntity); log.logFine("Deleted partial index (" + c + " URLs) for word " + indexEntities[i].wordHash() + "; " + sz + " entries left"); - // DEBUG: now try to delete the remaining index. If this works, this routine is fine - /* - if (wordIndex.getEntity(indexEntities[i].wordHash()).deleteComplete()) - System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " SUCCESSFULL"); - else - System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " FAILED"); - */ // end debug indexEntities[i].close(); } else { @@ -516,7 +503,7 @@ public final class plasmaWordIndexDistribution { } else { indexEntities[i].close(); // have another try... 
- if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot() /*PLASMADB*/, indexEntities[i].wordHash()).delete())) { + if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot(), indexEntities[i].wordHash()).delete())) { success = false; log.logSevere("Could not delete whole index for word " + indexEntities[i].wordHash()); } @@ -526,7 +513,8 @@ public final class plasmaWordIndexDistribution { } return success; } - + */ + public void startTransferWholeIndex(yacySeed seed, boolean delete) { if (transferIdxThread == null) { this.transferIdxThread = new transferIndexThread(seed,delete); @@ -573,14 +561,14 @@ public final class plasmaWordIndexDistribution { // word chunk private String endPointHash; private String startPointHash; - plasmaWordIndexEntity[] indexEntities; + plasmaWordIndexEntryContainer[] indexContainers; // other fields HashMap urlCache; public transferIndexWorkerThread( yacySeed seed, - plasmaWordIndexEntity[] indexEntities, + plasmaWordIndexEntryContainer[] indexContainers, HashMap urlCache, boolean gzipBody, int timeout, @@ -594,7 +582,7 @@ public final class plasmaWordIndexDistribution { this.timeout4Transfer = timeout; this.iteration = iteration; this.seed = seed; - this.indexEntities = indexEntities; + this.indexContainers = indexContainers; this.urlCache = urlCache; this.idxCount = idxCount; this.chunkSize = chunkSize; @@ -657,11 +645,11 @@ public final class plasmaWordIndexDistribution { // transfering seleted words to remote peer this.status = "Running: Transfering chunk " + iteration; - String error = yacyClient.transferIndex(seed, indexEntities, urlCache, gzipBody4Transfer, timeout4Transfer); + String error = yacyClient.transferIndex(seed, indexContainers, urlCache, gzipBody4Transfer, timeout4Transfer); if (error == null) { // words successfully transfered transferTime = System.currentTimeMillis() - start; - plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexEntities[0].wordHash() + " .. 
" + indexEntities[indexEntities.length-1].wordHash() + "]" + + plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length-1].wordHash() + "]" + " to peer " + seed.getName() + ":" + seed.hash + " in " + (transferTime/1000) + " seconds successfull (" + (1000 * idxCount / (transferTime + 1)) + " words/s)"); retryCount = 0; @@ -817,7 +805,7 @@ public final class plasmaWordIndexDistribution { } public void performTransferWholeIndex() { - plasmaWordIndexEntity[] newIndexEntities = null, oldIndexEntities = null; + plasmaWordIndexEntryContainer[] newIndexContainers = null, oldIndexContainers = null; try { // pausing the regular index distribution // TODO: adding sync, to wait for a still running index distribution to finish @@ -838,12 +826,12 @@ public final class plasmaWordIndexDistribution { iteration++; int idxCount = 0; selectionStart = System.currentTimeMillis(); - oldIndexEntities = newIndexEntities; + oldIndexContainers = newIndexContainers; // selecting 500 words to transfer this.status = "Running: Selecting chunk " + iteration; - Object[] selectResult = selectTransferIndexes(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue()); - newIndexEntities = (plasmaWordIndexEntity[]) selectResult[0]; + Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue()); + newIndexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0]; HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry openedFiles = (Integer) selectResult[2]; @@ -851,7 +839,7 @@ public final class plasmaWordIndexDistribution { * a) no words are left in the index * b) max open file limit was exceeded */ - if ((newIndexEntities == null) || (newIndexEntities.length == 0)) { + if ((newIndexContainers == null) || (newIndexContainers.length == 0)) { 
if (sb.wordIndex.size() > 0) { // if there are still words in the index we try it again now startPointHash = "------------"; @@ -863,15 +851,15 @@ public final class plasmaWordIndexDistribution { } } else { // count the indexes again, can be smaller as expected - for (int i = 0; i < newIndexEntities.length; i++) idxCount += newIndexEntities[i].size(); + for (int i = 0; i < newIndexContainers.length; i++) idxCount += newIndexContainers[i].size(); // getting start point for next DHT-selection oldStartingPointHash = startPointHash; - startPointHash = newIndexEntities[newIndexEntities.length - 1].wordHash(); // DHT targets must have greater hashes + startPointHash = newIndexContainers[newIndexContainers.length - 1].wordHash(); // DHT targets must have greater hashes selectionEnd = System.currentTimeMillis(); selectionTime = selectionEnd - selectionStart; - plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexEntities[0].wordHash() + " .. " + newIndexEntities[newIndexEntities.length-1].wordHash() + "]" + + plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexContainers[0].wordHash() + " .. " + newIndexContainers[newIndexContainers.length-1].wordHash() + "]" + " in " + (selectionTime / 1000) + " seconds (" + (1000 * idxCount / (selectionTime+1)) + " words/s)"); @@ -886,10 +874,10 @@ public final class plasmaWordIndexDistribution { this.status = "Aborted because of Transfer error:\n" + worker.getStatus(); // cleanup. 
closing all open files - closeEntities(oldIndexEntities); - oldIndexEntities = null; - closeEntities(newIndexEntities); - newIndexEntities = null; + closeContainers(oldIndexContainers); + oldIndexContainers = null; + closeContainers(newIndexContainers); + newIndexContainers = null; // abort index transfer return; @@ -922,10 +910,10 @@ public final class plasmaWordIndexDistribution { if (delete) { this.status = "Running: Deleting chunk " + iteration; try { - if (deleteTransferIndexes(oldIndexEntities)) { - plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexEntities.length + " transferred whole-word indexes locally"); + if (deleteTransferIndexes(oldIndexContainers)) { + plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexContainers.length + " transferred whole-word indexes locally"); transferedEntryCount += idxCount; - transferedEntityCount += oldIndexEntities.length; + transferedEntityCount += oldIndexContainers.length; } else { plasmaWordIndexDistribution.this.log.logSevere("Deleted not all transferred whole-word indexes"); } @@ -933,18 +921,18 @@ public final class plasmaWordIndexDistribution { plasmaWordIndexDistribution.this.log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee); } } else { - this.closeEntities(oldIndexEntities); + this.closeContainers(oldIndexContainers); transferedEntryCount += idxCount; - transferedEntityCount += oldIndexEntities.length; + transferedEntityCount += oldIndexContainers.length; } - oldIndexEntities = null; + oldIndexContainers = null; } this.worker = null; } // handover chunk to transfer worker - if (!((newIndexEntities == null) || (newIndexEntities.length == 0))) { - worker = new transferIndexWorkerThread(seed,newIndexEntities,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash); + if (!((newIndexContainers == null) || (newIndexContainers.length == 0))) { + worker = new 
transferIndexWorkerThread(seed,newIndexContainers,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash); worker.start(); } } @@ -961,30 +949,21 @@ public final class plasmaWordIndexDistribution { try {worker.join();}catch(Exception e){} // worker = null; } - if (oldIndexEntities != null) closeEntities(oldIndexEntities); - if (newIndexEntities != null) closeEntities(newIndexEntities); + if (oldIndexContainers != null) closeContainers(oldIndexContainers); + if (newIndexContainers != null) closeContainers(newIndexContainers); plasmaWordIndexDistribution.this.paused = false; } } - private void closeEntities(plasmaWordIndexEntity[] indexEntities) { - if ((indexEntities == null)||(indexEntities.length ==0)) return; + private void closeContainers(plasmaWordIndexEntryContainer[] indexContainers) { + if ((indexContainers == null)||(indexContainers.length ==0)) return; - for (int i = 0; i < indexEntities.length; i++) try { - indexEntities[i].close(); - } catch (IOException ee) {} - } - - /* - private boolean isAborted() { - if (finished || Thread.currentThread().isInterrupted()) { - this.status = "aborted"; - return true; - } - return false; + for (int i = 0; i < indexContainers.length; i++) { + indexContainers[i] = null; + } } - */ + } } diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java index 94ee53522..723cf82a7 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntity.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java @@ -48,7 +48,6 @@ import java.io.File; import java.io.IOException; import java.util.Iterator; import java.util.TreeMap; -import java.util.Set; import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroException; @@ -111,6 +110,7 @@ public final class plasmaWordIndexEntity { hash.substring(4,6) + "/" + hash + ".db"); } + /* public plasmaWordIndexEntity(String wordHash) 
{ // this creates a nameless temporary index. It is needed for combined search // and used to hold the intersection of two indexes @@ -121,7 +121,7 @@ public final class plasmaWordIndexEntity { theLocation = null; theTmpMap = new TreeMap(); } - +*/ public boolean isTMPEntity() { return theTmpMap != null; } @@ -302,12 +302,6 @@ public final class plasmaWordIndexEntity { else return "EMPTY"; } - // join methods - private static int log2(int x) { - int l = 0; - while (x > 0) {x = x >> 1; l++;} - return l; - } public void merge(plasmaWordIndexEntity otherEntity, long time) throws IOException { // this is a merge of another entity to this entity @@ -324,6 +318,14 @@ public final class plasmaWordIndexEntity { } } + /* + // join methods + private static int log2(int x) { + int l = 0; + while (x > 0) {x = x >> 1; l++;} + return l; + } + public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException { // big problem here: there cannot be a time-out for join, since a time-out will leave the joined set too big. 
@@ -485,5 +487,5 @@ public final class plasmaWordIndexEntity { } return conj; } - +*/ } \ No newline at end of file diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java index e65d3c136..9cec2ac34 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java @@ -100,18 +100,21 @@ public final class plasmaWordIndexEntry { public static final char DT_UNKNOWN = 'u'; // appearance locations: (used for flags) - public static final int AP_TITLE = 0; // title tag from html header - public static final int AP_H1 = 1; // h1-tag - public static final int AP_H2 = 2; // h2-tag - public static final int AP_H3 = 3; // h3-tag - public static final int AP_H4 = 4; // h4-tag - public static final int AP_H5 = 5; // h5-tag - public static final int AP_H6 = 6; // h6-tag - public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam) - public static final int AP_URL = 8; // word inside an url - public static final int AP_IMG = 9; // tag inside image references - public static final int AP_TAG = 10; // for tagged indexeing (i.e. using mp3 tags) - public static final int AP_ANCHOR = 11; // anchor description + public static final int AP_TITLE = 0; // title tag from html header + public static final int AP_H1 = 1; // h1-tag + public static final int AP_H2 = 2; // h2-tag + public static final int AP_H3 = 3; // h3-tag + public static final int AP_H4 = 4; // h4-tag + public static final int AP_H5 = 5; // h5-tag + public static final int AP_H6 = 6; // h6-tag + public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam) + public static final int AP_URL = 8; // word inside an url + public static final int AP_IMG = 9; // tag inside image references + public static final int AP_TAG = 10; // for tagged indexeing (i.e. 
using mp3 tags) + public static final int AP_ANCHOR = 11; // anchor description + public static final int AP_BOLD = 12; + public static final int AP_ITALICS = 13; + public static final int AP_INVISIBLE = 14; // good for spam detection // URL attributes public static final int UA_LOCAL = 0; // URL was crawled locally @@ -208,6 +211,8 @@ public final class plasmaWordIndexEntry { // the class instantiation can only be done by a plasmaStore method // therefore they are all public public plasmaWordIndexEntry(String urlHash, + int urlLength, // byte-length of complete URL + int urlComps, // number of path components int hitcount, //*how often appears this word in the text int wordcount, //*total number of words int phrasecount, //*total number of phrases @@ -227,14 +232,9 @@ public final class plasmaWordIndexEntry { // more needed attributes: // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc // - boolean: URL attributes - // - int: url-length (shorter are better) - // - int: url-number of components / length of path // - int: length of description tag / title tag (longer are better) - // - int: number of chapters // - int: # of outlinks to same domain // - int: # of outlinks to outside domain - // - int: length of description - // - int: length of title // - int: # of keywords if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk"; diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java index 458ada0db..2737d5664 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java @@ -54,12 +54,14 @@ package de.anomic.plasma; import java.util.HashMap; import java.util.Iterator; +import java.util.Set; +import java.util.TreeMap; import de.anomic.kelondro.kelondroBase64Order; public final class plasmaWordIndexEntryContainer implements Comparable { - 
private final String wordHash; + private String wordHash; private final HashMap container; // urlHash/plasmaWordIndexEntry - Mapping private long updateTime; @@ -73,6 +75,15 @@ public final class plasmaWordIndexEntryContainer implements Comparable { container = new HashMap(initContainerSize); // a urlhash/plasmaWordIndexEntry - relation } + public void setWordHash(String newWordHash) { + // this is used to replicate a container for different word indexes during global search + this.wordHash = newWordHash; + } + + public void clear() { + container.clear(); + } + public int size() { return container.size(); } @@ -85,14 +96,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable { return wordHash; } + public int add(plasmaWordIndexEntry entry) { + return add(entry, System.currentTimeMillis()); + } + public int add(plasmaWordIndexEntry entry, long updateTime) { this.updateTime = java.lang.Math.max(this.updateTime, updateTime); - return (add(entry)) ? 1 : 0; + return (addi(entry)) ? 
1 : 0; } public int add(plasmaWordIndexEntry[] entries, long updateTime) { int c = 0; - for (int i = 0; i < entries.length; i++) if (add(entries[i])) c++; + for (int i = 0; i < entries.length; i++) if (addi(entries[i])) c++; this.updateTime = java.lang.Math.max(this.updateTime, updateTime); return c; } @@ -102,13 +117,13 @@ public final class plasmaWordIndexEntryContainer implements Comparable { Iterator i = c.entries(); int x = 0; while (i.hasNext()) { - if (add((plasmaWordIndexEntry) i.next())) x++; + if (addi((plasmaWordIndexEntry) i.next())) x++; } this.updateTime = java.lang.Math.max(this.updateTime, c.updateTime); return x; } - private boolean add(plasmaWordIndexEntry entry) { + private boolean addi(plasmaWordIndexEntry entry) { // returns true if the new entry was added, false if it already existet return (container.put(entry.getUrlHash(), entry) == null); } @@ -117,10 +132,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable { return container.containsKey(urlHash); } + public plasmaWordIndexEntry get(String urlHash) { + return (plasmaWordIndexEntry) container.get(urlHash); + } + public plasmaWordIndexEntry[] getEntryArray() { return (plasmaWordIndexEntry[]) container.values().toArray(); } + public plasmaWordIndexEntry remove(String urlHash) { + return (plasmaWordIndexEntry) container.remove(urlHash); + } + public Iterator entries() { // returns an iterator of plasmaWordIndexEntry objects return container.values().iterator(); @@ -146,4 +169,126 @@ public final class plasmaWordIndexEntryContainer implements Comparable { return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4)); } + public static plasmaWordIndexEntryContainer joinContainer(Set containers, long time, int maxDistance) { + + long stamp = System.currentTimeMillis(); + + // order entities by their size + TreeMap map = new TreeMap(); + plasmaWordIndexEntryContainer singleContainer; + Iterator i = containers.iterator(); + int count = 0; + while 
(i.hasNext()) { + // get next entity: + singleContainer = (plasmaWordIndexEntryContainer) i.next(); + + // check result + if ((singleContainer == null) || (singleContainer.size() == 0)) return new plasmaWordIndexEntryContainer(null); // as this is a conjunction of searches, we have no result if any word is not known + + // store result in order of result size + map.put(new Long(singleContainer.size() * 1000 + count), singleContainer); + count++; + } + + // check if there is any result + if (map.size() == 0) return new plasmaWordIndexEntryContainer(null); // no result, nothing found + + // the map now holds the search results in order of number of hits per word + // we now must pairwise build up a conjunction of these sets + Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries + plasmaWordIndexEntryContainer searchA, searchB, searchResult = (plasmaWordIndexEntryContainer) map.remove(k); + while ((map.size() > 0) && (searchResult.size() > 0)) { + // take the first element of map which is a result and combine it with result + k = (Long) map.firstKey(); // the next smallest... 
+ time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis(); + searchA = searchResult; + searchB = (plasmaWordIndexEntryContainer) map.remove(k); + searchResult = plasmaWordIndexEntryContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance); + // free resources + searchA = null; + searchB = null; + } + + // in 'searchResult' is now the combined search result + if (searchResult.size() == 0) return new plasmaWordIndexEntryContainer(null); + return searchResult; + } + + // join methods + private static int log2(int x) { + int l = 0; + while (x > 0) {x = x >> 1; l++;} + return l; + } + + public static plasmaWordIndexEntryContainer joinConstructive(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) { + if ((i1 == null) || (i2 == null)) return null; + if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntryContainer(null); + + // decide which method to use + int high = ((i1.size() > i2.size()) ? i1.size() : i2.size()); + int low = ((i1.size() > i2.size()) ? 
i2.size() : i1.size()); + int stepsEnum = 10 * (high + low - 1); + int stepsTest = 12 * log2(high) * low; + + // start most efficient method + if (stepsEnum > stepsTest) { + if (i1.size() < i2.size()) + return joinConstructiveByTest(i1, i2, time, maxDistance); + else + return joinConstructiveByTest(i2, i1, time, maxDistance); + } else { + return joinConstructiveByEnumeration(i1, i2, time, maxDistance); + } + } + + private static plasmaWordIndexEntryContainer joinConstructiveByTest(plasmaWordIndexEntryContainer small, plasmaWordIndexEntryContainer large, long time, int maxDistance) { + System.out.println("DEBUG: JOIN METHOD BY TEST"); + plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result + Iterator se = small.entries(); + plasmaWordIndexEntry ie0, ie1; + long stamp = System.currentTimeMillis(); + while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) { + ie0 = (plasmaWordIndexEntry) se.next(); + ie1 = large.get(ie0.getUrlHash()); + if (ie1 != null) { + // this is a hit. 
Calculate word distance: + ie0.combineDistance(ie1); + if (ie0.worddistance() <= maxDistance) conj.add(ie0); + } + } + return conj; + } + + private static plasmaWordIndexEntryContainer joinConstructiveByEnumeration(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) { + System.out.println("DEBUG: JOIN METHOD BY ENUMERATION"); + plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result + Iterator e1 = i1.entries(); + Iterator e2 = i2.entries(); + int c; + if ((e1.hasNext()) && (e2.hasNext())) { + plasmaWordIndexEntry ie1; + plasmaWordIndexEntry ie2; + ie1 = (plasmaWordIndexEntry) e1.next(); + ie2 = (plasmaWordIndexEntry) e2.next(); + + long stamp = System.currentTimeMillis(); + while ((System.currentTimeMillis() - stamp) < time) { + c = ie1.getUrlHash().compareTo(ie2.getUrlHash()); + if (c < 0) { + if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break; + } else if (c > 0) { + if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break; + } else { + // we have found the same urls in different searches! 
+ ie1.combineDistance(ie2); + if (ie1.worddistance() <= maxDistance) conj.add(ie1); + if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break; + if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break; + } + } + } + return conj; + } + } diff --git a/source/de/anomic/plasma/plasmaWordIndexInterface.java b/source/de/anomic/plasma/plasmaWordIndexInterface.java index 078518d2a..2026d8f59 100644 --- a/source/de/anomic/plasma/plasmaWordIndexInterface.java +++ b/source/de/anomic/plasma/plasmaWordIndexInterface.java @@ -50,7 +50,8 @@ public interface plasmaWordIndexInterface { public Iterator wordHashes(String startWordHash, boolean up); - public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime); + public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime); + public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime); public long getUpdateTime(String wordHash); public void deleteIndex(String wordHash); diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index e6f6d05e2..76d550912 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -50,12 +50,13 @@ import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.Iterator; + +import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpc; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.plasma.plasmaURLPattern; @@ -348,14 +349,15 @@ public final class yacyClient { } public static int search( - String wordhashes, + String wordhashes, + int maxDistance, boolean global, 
yacySeed targetPeer, - plasmaCrawlLURL urlManager, - plasmaWordIndexEntity entityCache, - plasmaURLPattern blacklist, - plasmaSnippetCache snippets, - plasmaSearchProfile profile + plasmaCrawlLURL urlManager, + plasmaWordIndexEntryContainer containerCache, + plasmaURLPattern blacklist, + plasmaSnippetCache snippets, + plasmaSearchProfile profile ) { // send a search request to peer with remote Hash // this mainly converts the words into word hashes @@ -403,6 +405,7 @@ public final class yacyClient { obj.put("ttl", "0"); obj.put("duetime", Long.toString(duetime)); obj.put("profile", profile.targetToString()); // new duetimes splitted by specific search tasks + obj.put("maxdist", maxDistance); obj.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date())); //yacyCore.log.logDebug("yacyClient.search url=" + url); @@ -460,6 +463,9 @@ public final class yacyClient { // get one single search result urlEntry = urlManager.newEntry((String) result.get("resource" + n), true); if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist + int urlLength = urlEntry.url().toString().length(); + int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length; + urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); // save the url entry final plasmaWordIndexEntry entry; @@ -467,6 +473,7 @@ public final class yacyClient { // the old way to define words entry = new plasmaWordIndexEntry( urlEntry.hash(), + urlLength, urlComps, urlEntry.wordCount(), 0, 0, 0, 0, 0, 0, urlEntry.size(), @@ -494,7 +501,7 @@ public final class yacyClient { } // finally insert the containers to the index - for (int m = 0; m < words; m++) { entityCache.addEntries(container[m]); } + for (int m = 0; m < words; m++) { containerCache.add(container[m]); } // generate statistics long searchtime; @@ -841,7 +848,7 @@ public final class yacyClient { httpHeader requestHeader) 
throws IOException { */ - public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, HashMap urlCache, boolean gzipBody, int timeout) { + public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) { HashMap in = transferRWI(targetSeed, indexes, gzipBody, timeout); if (in == null) { return "no_connection_1"; } @@ -875,7 +882,7 @@ public final class yacyClient { return null; } - private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, boolean gzipBody, int timeout) { + private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, boolean gzipBody, int timeout) { final String address = targetSeed.getAddress(); if (address == null) { return null; } @@ -903,7 +910,7 @@ public final class yacyClient { Iterator eenum; plasmaWordIndexEntry entry; for (int i = 0; i < indexes.length; i++) { - eenum = indexes[i].elements(true); + eenum = indexes[i].entries(); while (eenum.hasNext()) { entry = (plasmaWordIndexEntry) eenum.next(); entrypost.append(indexes[i].wordHash()) diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index cb72dca6c..2e82cd24c 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -52,8 +52,8 @@ import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaURLPattern; import de.anomic.plasma.plasmaSnippetCache; -import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaSearchProfile; +import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.logging.serverLog; public class yacySearch extends Thread { @@ -61,29 +61,31 @@ public class yacySearch extends Thread { final private Set wordhashes; final private boolean global; final private plasmaCrawlLURL urlManager; - final private plasmaWordIndexEntity 
entityCache; + final private plasmaWordIndexEntryContainer containerCache; final private plasmaURLPattern blacklist; final private plasmaSnippetCache snippetCache; final private yacySeed targetPeer; private int links; + private int maxDistance; final private plasmaSearchProfile profile; - public yacySearch(Set wordhashes, boolean global, yacySeed targetPeer, - plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) { + public yacySearch(Set wordhashes, int maxDistance, boolean global, yacySeed targetPeer, + plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) { super("yacySearch_" + targetPeer.getName()); this.wordhashes = wordhashes; this.global = global; this.urlManager = urlManager; - this.entityCache = entityCache; + this.containerCache = containerCache; this.blacklist = blacklist; this.snippetCache = snippetCache; this.targetPeer = targetPeer; this.links = -1; + this.maxDistance = maxDistance; this.profile = (plasmaSearchProfile) profile.clone(); } public void run() { - this.links = yacyClient.search(set2string(wordhashes), global, targetPeer, urlManager, entityCache, blacklist, snippetCache, profile); + this.links = yacyClient.search(set2string(wordhashes), maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, profile); if (links != 0) { //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes); yacyCore.seedDB.mySeed.incRI(links); @@ -172,7 +174,7 @@ public class yacySearch extends Thread { return result; } - public static yacySearch[] searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache, + public static yacySearch[] searchHashes(Set wordhashes, int maxDist, plasmaCrawlLURL 
urlManager, plasmaWordIndexEntryContainer containerCache, int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) { // check own peer status if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; } @@ -185,8 +187,8 @@ public class yacySearch extends Thread { if (targets == 0) return null; yacySearch[] searchThreads = new yacySearch[targets]; for (int i = 0; i < targets; i++) { - searchThreads[i]= new yacySearch(wordhashes, true, targetPeers[i], - urlManager, entityCache, blacklist, snippetCache, profile); + searchThreads[i]= new yacySearch(wordhashes, maxDist, true, targetPeers[i], + urlManager, containerCache, blacklist, snippetCache, profile); searchThreads[i].start(); try {Thread.sleep(20);} catch (InterruptedException e) {} @@ -216,5 +218,4 @@ public class yacySearch extends Thread { } } - }