diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index 10f182b55..415cfc12f 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -62,6 +62,7 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
+import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient;
@@ -255,12 +256,12 @@ public class IndexControl_p {
}
prop.put("urlstring", "");
prop.put("urlhash", "");
- plasmaWordIndexEntity[] indexes = new plasmaWordIndexEntity[1];
+ plasmaWordIndexEntryContainer[] indexes = new plasmaWordIndexEntryContainer[1];
String result;
long starttime = System.currentTimeMillis();
- indexes[0] = switchboard.wordIndex.getEntity(keyhash, true, -1);
+ indexes[0] = switchboard.wordIndex.getContainer(keyhash, true, -1);
// built urlCache
- Iterator urlIter = indexes[0].elements(true);
+ Iterator urlIter = indexes[0].entries();
HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet();
plasmaWordIndexEntry indexEntry;
@@ -282,9 +283,7 @@ public class IndexControl_p {
// now delete all entries that have no url entry
Iterator hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
- try {
- indexes[0].removeEntry((String) hashIter.next(), false);
- } catch (IOException e) {}
+ indexes[0].remove((String) hashIter.next());
}
// use whats remaining
String gzipBody = switchboard.getConfig("indexControl.gzipBody","false");
@@ -296,7 +295,8 @@ public class IndexControl_p {
"true".equalsIgnoreCase(gzipBody),
timeout);
prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
- try {indexes[0].close();} catch (IOException e) {}
+ indexes[0] = null;
+ indexes = null;
}
// generate list
@@ -431,15 +431,15 @@ public class IndexControl_p {
public static String genUrlList(plasmaSwitchboard switchboard, String keyhash, String keystring) {
// search for a word hash and generate a list of url links
- plasmaWordIndexEntity index = null;
+ plasmaWordIndexEntryContainer index = null;
try {
- index = switchboard.wordIndex.getEntity(keyhash, true, -1);
+ index = switchboard.wordIndex.getContainer(keyhash, true, -1);
final StringBuffer result = new StringBuffer(1024);
if (index.size() == 0) {
result.append("No URL entries related to this word hash ").append(keyhash).append(".");
} else {
- final Iterator en = index.elements(true);
+ final Iterator en = index.entries();
result.append("URL entries related to this word hash ").append(keyhash).append("\n");
result.append("\n");
}
- index.close();
index = null;
return result.toString();
} catch (IOException e) {
return "";
} finally {
- if (index != null) try { index.close(); index = null; } catch (Exception e) {};
+ if (index != null) index = null;
}
}
diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java
index c50f10593..7bbee28dd 100644
--- a/htroot/htdocsdefault/dir.java
+++ b/htroot/htdocsdefault/dir.java
@@ -463,7 +463,7 @@ public class dir {
"AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/
false, /*localneed*/
- condenser.RESULT_INFORMATION_VALUE,
+ condenser.RESULT_WORD_ENTROPHY,
"**", /*language*/
plasmaWordIndexEntry.DT_SHARE, /*doctype*/
phrase.length(), /*size*/
diff --git a/htroot/index.java b/htroot/index.java
index d10fb0df1..4aad3693c 100644
--- a/htroot/index.java
+++ b/htroot/index.java
@@ -126,7 +126,12 @@ public class index {
// SEARCH
// process search words
- final String querystring = post.get("search", "");
+ int maxDistance = Integer.MAX_VALUE;
+ String querystring = post.get("search", "").trim();
+ if ((querystring.length() > 1) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
+ querystring = querystring.substring(1, querystring.length() - 1).trim();
+ maxDistance = 1;
+ }
if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {}
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
// filter out stopwords
@@ -172,7 +177,7 @@ public class index {
}
// do the search
- plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
+ plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, maxDistance, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20);
final serverObjects prop = sb.searchFromLocal(thisSearch);
diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index 93c2940cf..cd15fa1ef 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -47,7 +47,6 @@
// javac -classpath .:../../Classes search.java
// if the shell's current path is htroot/yacy
-import java.io.IOException;
import java.util.HashSet;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
@@ -81,6 +80,7 @@ public final class search {
// final String fwden = post.get("fwden", ""); // forward deny, a list of seed hashes. They may NOT be target of forward hopping
final long duetime= post.getLong("duetime", 3000);
final int count = post.getInt("count", 10); // maximum number of wanted results
+ final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@@ -103,8 +103,8 @@ public final class search {
}
final long timestamp = System.currentTimeMillis();
- plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
- count, duetime, ".*");
+ plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
+ count, duetime, ".*");
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
serverObjects prop = new serverObjects();
@@ -114,11 +114,8 @@ public final class search {
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
plasmaSearchResult acc = null;
int idxc = 0;
- try {
- idxc = theSearch.localSearch();
- acc = theSearch.order();
- } catch (IOException e) {
- }
+ idxc = theSearch.localSearch();
+ acc = theSearch.order();
// result is a List of urlEntry elements
if ((idxc == 0) || (acc == null)) {
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 38d3eaa67..dde0f89a7 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -75,6 +75,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
linkTags0.add("img");
linkTags0.add("base");
linkTags0.add("frame");
+ linkTags0.add("meta");
linkTags1 = new TreeSet(insensitiveCollator);
linkTags1.add("a");
@@ -88,6 +89,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// class variables: collectors for links
private HashMap anchors;
private HashMap images;
+ private HashMap metas;
private String title;
//private String headline;
private List[] headlines;
@@ -101,6 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.root = root;
this.anchors = new HashMap();
this.images = new HashMap();
+ this.metas = new HashMap();
this.title = "";
this.headlines = new ArrayList[4];
for (int i = 0; i < 4; i++) headlines[i] = new ArrayList();
@@ -193,7 +196,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return null;
}
}
-
+
+ public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
+ public static String[] urlComps(String normalizedURL) {
+ return normalizedURL.toLowerCase().split(splitrex); // word components of the url
+ }
+
private String absolutePath(String relativePath) {
try {
return urlNormalform(new URL(root, relativePath));
@@ -206,6 +214,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {}
if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
+ if (tagname.equalsIgnoreCase("meta")) metas.put((tagopts.getProperty("name", "")).toLowerCase(), tagopts.getProperty("content",""));
}
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
@@ -252,10 +261,16 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// construct a title string, even if the document has no title
// if there is one, return it
if (title.length() > 0) return title;
+
// othervise take any headline
for (int i = 0; i < 4; i++) {
if (headlines[i].size() > 0) return (String) headlines[i].get(0);
}
+
+ // take description tag
+ String s = getDescription();
+ if (s.length() > 0) return s;
+
// extract headline from content
if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80));
return cleanLine(content.trim().toString());
@@ -280,6 +295,45 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return images;
}
+ public Map getMetas() {
+ return metas;
+ }
+
+ public String getDescription() {
+ String s = (String) metas.get("description");
+ if (s == null) return ""; else return s;
+ }
+
+ public String getContentType() {
+ String s = (String) metas.get("content-type");
+ if (s == null) return ""; else return s;
+ }
+
+ public String getCopyright() {
+ String s = (String) metas.get("copyright");
+ if (s == null) return ""; else return s;
+ }
+
+ public String[] getContentLanguages() {
+ String s = (String) metas.get("content-language");
+ if (s == null) s = "";
+ return s.split(" |,");
+ }
+
+ public String[] getKeywords() {
+ String s = (String) metas.get("keywords");
+ if (s == null) s = "";
+ if (s.length() == 0) {
+ return getTitle().toLowerCase().split(splitrex);
+ } else {
+ return s.split(" |,");
+ }
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see de.anomic.htmlFilter.htmlFilterScraper#close()
+ */
public void close() {
// free resources
super.close();
@@ -298,6 +352,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
System.out.println("ANCHORS :" + anchors.toString());
System.out.println("IMAGES :" + images.toString());
+ System.out.println("METAS :" + metas.toString());
System.out.println("TEXT :" + new String(content.getBytes()));
}
diff --git a/source/de/anomic/kelondro/kelondroTree.java b/source/de/anomic/kelondro/kelondroTree.java
index a34e23c59..96e5a25bb 100644
--- a/source/de/anomic/kelondro/kelondroTree.java
+++ b/source/de/anomic/kelondro/kelondroTree.java
@@ -850,11 +850,11 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
this.rot = rotating;
ii = new nodeIterator(asc, rot, start);
nextNode = (ii.hasNext()) ? (Node) ii.next() : null;
- if (nextNode != null) {
+ if ((nextNode != null) && (nextNode.getKey() != null)) {
int c = objectOrder.compare(firstKey, nextNode.getKey());
if ((c > 0) && (asc)) {
// firstKey > nextNode.getKey()
- log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey()));
+ if (log != null) log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey()));
nextNode = (ii.hasNext()) ? (Node) ii.next() : null;
}
if ((c < 0) && (!(asc))) {
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index b912a4101..830fbb2a2 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -83,9 +83,6 @@ public final class plasmaCondenser {
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public int RESULT_SIMI_SENTENCES = -1;
- public int RESULT_AVERAGE_WORD_OCC = -1;
- public int RESULT_INFORMATION_VALUE = -1;
-
public plasmaCondenser(InputStream text) {
this(text, 3, 2);
@@ -357,8 +354,7 @@ public final class plasmaCondenser {
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
this.RESULT_SIMI_SENTENCES = sentences.size();
- this.RESULT_AVERAGE_WORD_OCC = (words.size() == 0) ? 0 : (allwordcounter / words.size());
- this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
+ //this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
}
public void print() {
diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java
index 9bf2408a1..6b44693a1 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@@ -176,7 +176,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
- public synchronized Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
+ public Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
return new Entry(hash, searchedWord);
}
@@ -399,8 +399,16 @@ public final class plasmaCrawlLURL extends plasmaURL {
private int size;
private int wordCount;
private String snippet;
- private plasmaWordIndexEntry word;
-
+ private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests
+
+ // more needed attributes:
+ // - author / copyright owner
+ // - keywords
+ // - phrasecount, total number of phrases
+ // - boolean: URL attributes
+ // - int: # of outlinks to same domain
+ // - int: # of outlinks to outside domain
+
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
// create new entry and store it into database
this.urlHash = urlHash(url);
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index 249213774..baf32b78e 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -63,7 +63,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
private plasmaWordIndex wordIndex;
private plasmaCrawlLURL urlStore;
private plasmaSnippetCache snippetCache;
- private plasmaWordIndexEntity rcLocal, rcGlobal; // caches for results
+ private plasmaWordIndexEntryContainer rcLocal, rcGlobal; // caches for results
private plasmaSearchProfile profileLocal, profileGlobal;
private yacySearch[] searchThreads;
@@ -73,8 +73,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
this.query = query;
this.urlStore = urlStore;
this.snippetCache = snippetCache;
- this.rcLocal = new plasmaWordIndexEntity(null);
- this.rcGlobal = new plasmaWordIndexEntity(null);
+ this.rcLocal = new plasmaWordIndexEntryContainer(null);
+ this.rcGlobal = new plasmaWordIndexEntryContainer(null);
if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) {
this.profileLocal = new plasmaSearchProfile(4 * query.maximumTime / 10, query.wantedResults);
this.profileGlobal = new plasmaSearchProfile(6 * query.maximumTime / 10, query.wantedResults);
@@ -114,68 +114,56 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
int globalContributions = globalSearch(fetchpeers);
log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
- try {
- // combine the result and order
- plasmaSearchResult result = order();
- result.globalContributions = globalContributions;
- result.localContributions = rcLocal.size();
-
- // flush results in a separate thread
- this.start(); // start to flush results
- //serverInstantThread.oneTimeJob(this, "flushResults", log, 0);
-
- // clean up
- if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close();
- rcLocal = null;
-
- // return search result
- log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
- lastEvent = this;
- return result;
- } catch (IOException e) {
- return null;
- }
+ // combine the result and order
+ plasmaSearchResult result = order();
+ result.globalContributions = globalContributions;
+ result.localContributions = rcLocal.size();
+
+ // flush results in a separate thread
+ this.start(); // start to flush results
+ //serverInstantThread.oneTimeJob(this, "flushResults", log, 0);
+
+ // clean up
+ rcLocal = null;
+
+ // return search result
+ log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
+ lastEvent = this;
+ return result;
} else {
- // do a local search
- //long start = System.currentTimeMillis();
- try {
- localSearch();
- plasmaSearchResult result = order();
- result.localContributions = rcLocal.size();
-
- // clean up
- if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close();
- rcLocal = null;
-
- // return search result
- log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
- lastEvent = this;
- return result;
- } catch (IOException e) {
- return null;
- }
+ localSearch();
+ plasmaSearchResult result = order();
+ result.localContributions = rcLocal.size();
+
+ // clean up
+ rcLocal = null;
+
+ // return search result
+ log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
+ lastEvent = this;
+ return result;
}
}
- public int localSearch() throws IOException {
+ public int localSearch() {
// search for the set of hashes and return an array of urlEntry elements
// retrieve entities that belong to the hashes
profileLocal.startTimer();
- Set entities = wordIndex.getEntities(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION));
- if (entities.size() < query.size()) entities = null; // prevent that only a subset is returned
+ Set containers = wordIndex.getContainers(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION));
+ if (containers.size() < query.size()) containers = null; // prevent that only a subset is returned
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_COLLECTION);
- profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (entities == null) ? 0 : entities.size());
+ profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (containers == null) ? 0 : containers.size());
// since this is a conjunction we return an empty entity if any word is not known
- if (entities == null) {
- rcLocal = new plasmaWordIndexEntity(null);
+ if (containers == null) {
+ rcLocal = new plasmaWordIndexEntryContainer(null);
return 0;
}
// join the result
profileLocal.startTimer();
- rcLocal = plasmaWordIndexEntity.joinEntities(entities, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN));
+ rcLocal = plasmaWordIndexEntryContainer.joinContainer(containers, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN), query.maxDistance);
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_JOIN);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_JOIN, rcLocal.size());
@@ -190,7 +178,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS");
long timeout = System.currentTimeMillis() + profileGlobal.duetime() + 4000;
- searchThreads = yacySearch.searchHashes(query.queryHashes, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal);
+ searchThreads = yacySearch.searchHashes(query.queryHashes, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal);
// wait until wanted delay passed or wanted result appeared
while (System.currentTimeMillis() < timeout) {
@@ -204,20 +192,20 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
return rcGlobal.size();
}
- public plasmaSearchResult order() throws IOException {
+ public plasmaSearchResult order() {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
- plasmaWordIndexEntity searchResult = new plasmaWordIndexEntity(null);
- searchResult.merge(rcLocal, -1);
- searchResult.merge(rcGlobal, -1);
+ plasmaWordIndexEntryContainer searchResult = new plasmaWordIndexEntryContainer(null);
+ searchResult.add(rcLocal);
+ searchResult.add(rcGlobal);
long preorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_PRESORT);
long postorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_POSTSORT);
profileLocal.startTimer();
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query);
- preorder.addEntity(searchResult, preorderTime);
+ preorder.addContainer(searchResult, preorderTime);
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_PRESORT);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_PRESORT, rcLocal.size());
@@ -289,19 +277,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
Iterator hashi = query.queryHashes.iterator();
while (hashi.hasNext()) {
wordHash = (String) hashi.next();
- Iterator i = rcGlobal.elements(true);
- plasmaWordIndexEntry entry;
- plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash, rcGlobal.size());
- while (i.hasNext()) {
- entry = (plasmaWordIndexEntry) i.next();
- container.add(entry, System.currentTimeMillis());
- }
- wordIndex.addEntries(container, true);
- log.logFine("FLUSHED " + wordHash + ": " + container.size() + " url entries");
+ rcGlobal.setWordHash(wordHash);
+ wordIndex.addEntries(rcGlobal, true);
+ log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries");
}
// the rcGlobal was flushed, empty it
count += rcGlobal.size();
- rcGlobal.deleteComplete();
+ rcGlobal.clear();
}
// wait a little bit before trying again
try {Thread.sleep(3000);} catch (InterruptedException e) {}
diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java
index b8fac651a..90ebb0af6 100644
--- a/source/de/anomic/plasma/plasmaSearchPreOrder.java
+++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java
@@ -116,8 +116,8 @@ public final class plasmaSearchPreOrder {
return (plasmaWordIndexEntry) pageAcc.remove(top);
}
- public void addEntity(plasmaWordIndexEntity entity, long maxTime) {
- Iterator i = entity.elements(true);
+ public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) {
+ Iterator i = container.entries();
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
plasmaWordIndexEntry entry;
while (i.hasNext()) {
diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java
index 57a33dee0..b19eb2d23 100644
--- a/source/de/anomic/plasma/plasmaSearchQuery.java
+++ b/source/de/anomic/plasma/plasmaSearchQuery.java
@@ -72,12 +72,14 @@ public final class plasmaSearchQuery {
public int domType;
public String domGroupName;
public int domMaxTargets;
+ public int maxDistance;
- public plasmaSearchQuery(Set queryWords,
+ public plasmaSearchQuery(Set queryWords, int maxDistance,
String[] order, int wantedResults, long maximumTime, String urlMask,
String referrer,
int domType, String domGroupName, int domMaxTargets) {
this.queryWords = queryWords;
+ this.maxDistance = maxDistance;
this.queryHashes = words2hashes(queryWords);
this.order = order;
this.wantedResults = wantedResults;
@@ -89,9 +91,10 @@ public final class plasmaSearchQuery {
this.domMaxTargets = domMaxTargets;
}
- public plasmaSearchQuery(Set queryHashes,
+ public plasmaSearchQuery(Set queryHashes, int maxDistance,
String[] order, int wantedResults, long maximumTime, String urlMask) {
this.queryWords = null;
+ this.maxDistance = maxDistance;
this.queryHashes = queryHashes;
this.order = order;
this.wantedResults = wantedResults;
diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java
index 164805f6d..e56fcb530 100644
--- a/source/de/anomic/plasma/plasmaSearchResult.java
+++ b/source/de/anomic/plasma/plasmaSearchResult.java
@@ -54,11 +54,10 @@ import java.net.MalformedURLException;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverCodings;
+import de.anomic.htmlFilter.htmlFilterContentScraper;
public final class plasmaSearchResult {
- public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
-
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
@@ -111,8 +110,8 @@ public final class plasmaSearchResult {
URL url = page.url();
String descr = page.descr();
if ((url == null) || (descr == null)) return;
- String[] urlcomps = url.toString().toLowerCase().split(splitrex); // word components of the url
- String[] descrcomps = descr.toLowerCase().split(splitrex); // words in the description
+ String[] urlcomps = htmlFilterContentScraper.urlComps(url.toString()); // word components of the url
+ String[] descrcomps = descr.toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
// store everything
Object[] resultVector = new Object[] {indexEntry, page, urlcomps, descrcomps};
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 31a04422b..f4b57214f 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1285,7 +1285,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
yacyCore.seedDB.mySeed.hash,
referrerHash,
0, true,
- condenser.RESULT_INFORMATION_VALUE,
+ condenser.RESULT_WORD_ENTROPHY,
plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()),
(int) entry.size(),
@@ -1313,15 +1313,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else {
HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry);
- ArrayList tmpEntities = new ArrayList(condenser.RESULT_SIMI_WORDS);
+ ArrayList tmpContainers = new ArrayList(condenser.RESULT_SIMI_WORDS);
String language = plasmaWordIndexEntry.language(entry.url());
char doctype = plasmaWordIndexEntry.docType(document.getMimeType());
- int quality = 0;
- try {
- quality = condenser.RESULT_INFORMATION_VALUE;
- } catch (NumberFormatException e) {
- System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + newEntry.url().toString());
- }
+ int urlLength = newEntry.url().toString().length();
+ int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length;
// iterate over all words
Iterator i = condenser.words();
@@ -1332,8 +1328,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = plasmaWordIndexEntry.word2hash(word);
- plasmaWordIndexEntity wordIdxEntity = new plasmaWordIndexEntity(wordHash);
+ plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash);
plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
+ urlLength, urlComps,
wordStat.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
@@ -1344,26 +1341,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
newEntry.size(),
docDate.getTime(),
System.currentTimeMillis(),
- quality, language, doctype, true);
- wordIdxEntity.addEntry(wordIdxEntry);
- tmpEntities.add(wordIdxEntity);
+ condenser.RESULT_WORD_ENTROPHY,
+ language,
+ doctype,
+ true);
+ wordIdxContainer.add(wordIdxEntry);
+ tmpContainers.add(wordIdxContainer);
// wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));
}
//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
words = condenser.RESULT_SIMI_WORDS;
// transfering the index to the storage peer
- String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000);
+ String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]),urlCache,true,120000);
if (error != null) {
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
}
- // cleanup
- for (int j=0; j < tmpEntities.size(); j++) {
- plasmaWordIndexEntity tmpEntity = (plasmaWordIndexEntity) tmpEntities.get(j);
- try { tmpEntity.close(); } catch (Exception e) {}
- }
+ tmpContainers = null;
}
storageEndTime = System.currentTimeMillis();
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 2e6961953..e3814224d 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -56,6 +56,7 @@ import java.util.Set;
import java.util.Date;
import java.net.URL;
+import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.server.logging.serverLog;
@@ -136,16 +137,8 @@ public final class plasmaWordIndex {
public int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaCondenser condenser, String language, char doctype) {
// this is called by the switchboard to put in a new page into the index
- // use all the words in one condenser object to simultanous create index
- // entries
- // int age = microDateDays(urlModified);
- int quality = 0;
- try {
- quality = condenser.RESULT_INFORMATION_VALUE;
- } catch (NumberFormatException e) {
- System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString());
- }
-
+ // use all the words in one condenser object to simultanous create index entries
+
// iterate over all words
Iterator i = condenser.words();
Map.Entry wentry;
@@ -153,6 +146,9 @@ public final class plasmaWordIndex {
plasmaWordIndexEntry ientry;
plasmaCondenser.wordStatProp wprop;
String wordHash;
+ int urlLength = url.toString().length();
+ int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
+
while (i.hasNext()) {
wentry = (Map.Entry) i.next();
word = (String) wentry.getKey();
@@ -160,6 +156,7 @@ public final class plasmaWordIndex {
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaWordIndexEntry.word2hash(word);
ientry = new plasmaWordIndexEntry(urlHash,
+ urlLength, urlComps,
wprop.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
@@ -170,18 +167,54 @@ public final class plasmaWordIndex {
size,
urlModified.getTime(),
System.currentTimeMillis(),
- quality, language, doctype, true);
+ condenser.RESULT_WORD_ENTROPHY,
+ language,
+ doctype,
+ true);
addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false);
}
// System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
// condenser.getWords().size() + " words, flushed " + c + " entries");
return condenser.RESULT_SIMI_WORDS;
}
+
+ public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
+ return ramCache.getContainer(wordHash, deleteIfEmpty, maxTime);
+ }
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
- return ramCache.getIndex(wordHash, deleteIfEmpty, maxTime);
+ return ramCache.getEntity(wordHash, deleteIfEmpty, maxTime);
}
+ public Set getContainers(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
+
+ // retrieve entities that belong to the hashes
+ HashSet containers = new HashSet();
+ String singleHash;
+ plasmaWordIndexEntryContainer singleContainer;
+ Iterator i = wordHashes.iterator();
+ long start = System.currentTimeMillis();
+ long remaining;
+ while (i.hasNext()) {
+ // check time
+ remaining = maxTime - (System.currentTimeMillis() - start);
+ //if ((maxTime > 0) && (remaining <= 0)) break;
+
+ // get next hash:
+ singleHash = (String) i.next();
+
+ // retrieve index
+ singleContainer = getContainer(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size()));
+
+ // check result
+ if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashSet();
+
+ containers.add(singleContainer);
+ }
+ return containers;
+ }
+
+ /*
public Set getEntities(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
// retrieve entities that belong to the hashes
@@ -203,13 +236,14 @@ public final class plasmaWordIndex {
singleEntity = getEntity(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - entities.size()));
// check result
- if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return null;
+ if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return new HashSet();
entities.add(singleEntity);
}
return entities;
}
-
+ */
+
public int size() {
return ramCache.size();
}
diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
index 8572e7b40..e1b054255 100644
--- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
+++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
@@ -203,7 +203,7 @@ public final class plasmaWordIndexAssortmentCluster {
}
public plasmaWordIndexEntryContainer removeFromAll(String wordHash, long maxTime) {
- // collect all records from all the assortments and return them
+        // removes all records from all the assortments and returns them
plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash);
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
for (int i = 0; i < clusterCount; i++) {
@@ -214,6 +214,18 @@ public final class plasmaWordIndexAssortmentCluster {
return record;
}
+ public plasmaWordIndexEntryContainer getFromAll(String wordHash, long maxTime) {
+ // collect all records from all the assortments and return them
+ plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash);
+ long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
+ for (int i = 0; i < clusterCount; i++) {
+ buffer = assortments[i].get(wordHash);
+ if (buffer != null) record.add(buffer);
+ if (System.currentTimeMillis() > limitTime) break;
+ }
+ return record;
+ }
+
public Iterator hashConjunction(String startWordHash, boolean up, boolean rot) {
HashSet iterators = new HashSet();
//if (rot) System.out.println("WARNING: kelondroMergeIterator does not work correctly when individual iterators rotate on their own!");
diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java
index eaf57a520..4e506fe94 100644
--- a/source/de/anomic/plasma/plasmaWordIndexCache.java
+++ b/source/de/anomic/plasma/plasmaWordIndexCache.java
@@ -391,7 +391,18 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
}
}
- public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) {
+ public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
+ long start = System.currentTimeMillis();
+ if (maxTime > 0) maxTime = 8 * maxTime / 10; // reserve time for later adding to backend
+ plasmaWordIndexEntryContainer container = assortmentCluster.getFromAll(wordHash, maxTime);
+ if (container == null) {
+ container = new plasmaWordIndexEntryContainer(wordHash);
+ }
+ container.add(backend.getContainer(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : System.currentTimeMillis() - start));
+ return container;
+ }
+
+ public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
// this possibly creates an index file in the back-end
// the index file is opened and returned as entity object
long start = System.currentTimeMillis();
@@ -406,7 +417,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
}
}
long r = maxTime - (System.currentTimeMillis() - start);
- return backend.getIndex(wordHash, deleteIfEmpty, (r < 0) ? 0 : r);
+ return backend.getEntity(wordHash, deleteIfEmpty, (r < 0) ? 0 : r);
}
public long getUpdateTime(String wordHash) {
diff --git a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java
index 5714f2038..42088db56 100644
--- a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java
+++ b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java
@@ -181,7 +181,24 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
}
}
- public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) {
+ public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
+ long start = System.currentTimeMillis();
+ if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) {
+ plasmaWordIndexEntity entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
+ plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash);
+ plasmaWordIndexEntry entry;
+ Iterator i = entity.elements(true);
+ while ((i.hasNext()) && ((maxTime < 0) || (System.currentTimeMillis() < start + maxTime))) {
+ entry = (plasmaWordIndexEntry) i.next();
+ container.add(entry);
+ }
+ return container;
+ } else {
+ return new plasmaWordIndexEntryContainer(wordHash, 0);
+ }
+ }
+
+ public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty);
}
@@ -190,7 +207,6 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
if (f.exists()) return f.lastModified(); else return -1;
}
-
public void deleteIndex(String wordHash) {
plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash);
}
@@ -200,7 +216,7 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
plasmaWordIndexEntity pi = null;
int count = 0;
try {
- pi = getIndex(wordHash, true, -1);
+ pi = getEntity(wordHash, true, -1);
for (int i = 0; i < urlHashes.length; i++)
if (pi.removeEntry(urlHashes[i], deleteComplete)) count++;
int size = pi.size();
diff --git a/source/de/anomic/plasma/plasmaWordIndexDistribution.java b/source/de/anomic/plasma/plasmaWordIndexDistribution.java
index 4ed88dbdf..78e0dcceb 100644
--- a/source/de/anomic/plasma/plasmaWordIndexDistribution.java
+++ b/source/de/anomic/plasma/plasmaWordIndexDistribution.java
@@ -201,33 +201,33 @@ public final class plasmaWordIndexDistribution {
// collect index
String startPointHash = selectTransferStart();
log.logFine("Selected hash " + startPointHash + " as start point for index distribution, distance = " + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash));
- Object[] selectResult = selectTransferIndexes(startPointHash, indexCount, this.maxOpenFiles4Distribution);
- plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) selectResult[0];
+ Object[] selectResult = selectTransferContainers(startPointHash, indexCount, this.maxOpenFiles4Distribution);
+ plasmaWordIndexEntryContainer[] indexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
//Integer openedFiles = (Integer) selectResult[2];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
- if ((indexEntities == null) || (indexEntities.length == 0)) {
+ if ((indexContainers == null) || (indexContainers.length == 0)) {
log.logFine("No index available for index transfer, hash start-point " + startPointHash);
return -1;
}
// count the indexes again, can be smaller as expected
indexCount = 0;
- for (int i = 0; i < indexEntities.length; i++) {
- indexCount += indexEntities[i].size();
+ for (int i = 0; i < indexContainers.length; i++) {
+ indexCount += indexContainers[i].size();
}
if (indexCount < 50) {
log.logFine("Too few (" + indexCount + ") indexes selected for transfer.");
- closeTransferIndexes (indexEntities);
+ closeTransferIndexes(indexContainers);
return -1; // failed
}
// find start point for DHT-selection
- String keyhash = indexEntities[indexEntities.length - 1].wordHash(); // DHT targets must have greater hashes
+ String keyhash = indexContainers[indexContainers.length - 1].wordHash(); // DHT targets must have greater hashes
// find a list of DHT-peers
yacySeed[] seeds = new yacySeed[peerCount + 10];
int hc0 = 0;
- double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[0].wordHash()),
- yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[indexEntities.length - 1].wordHash()));
+ double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[0].wordHash()),
+ yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[indexContainers.length - 1].wordHash()));
double maxDistance = Math.min(ownDistance, 0.4);
synchronized (yacyCore.dhtAgent) {
double avdist;
@@ -239,8 +239,8 @@ public final class plasmaWordIndexDistribution {
}
seeds[hc0] = (yacySeed) e.nextElement();
if (seeds[hc0] != null) {
- avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[0].wordHash()),
- yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[indexEntities.length - 1].wordHash()));
+ avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[0].wordHash()),
+ yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[indexContainers.length - 1].wordHash()));
if (avdist < maxDistance) {
log.logInfo("Selected " + ((hc0 < peerCount) ? "primary" : "reserve") + " DHT target peer " + seeds[hc0].getName() + ":" + seeds[hc0].hash + ", distance = " + avdist);
hc0++;
@@ -252,7 +252,7 @@ public final class plasmaWordIndexDistribution {
if (hc0 < peerCount) {
log.logWarning("found not enough (" + hc0 + ") peers for distribution");
- closeTransferIndexes (indexEntities);
+ closeTransferIndexes(indexContainers);
return -1; // failed
}
@@ -267,9 +267,9 @@ public final class plasmaWordIndexDistribution {
return -1; // interrupted
}
start = System.currentTimeMillis();
- error = yacyClient.transferIndex(seeds[i], indexEntities, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
+ error = yacyClient.transferIndex(seeds[i], indexContainers, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
if (error == null) {
- log.logInfo("Index transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
+ log.logInfo("Index transfer of " + indexCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
+ " seconds successfull (" + (1000 * indexCount / (System.currentTimeMillis() - start + 1)) + " words/s)");
peerNames += ", " + seeds[i].getName();
hc1++;
@@ -286,8 +286,8 @@ public final class plasmaWordIndexDistribution {
// success
if (delete) {
try {
- if (deleteTransferIndexes(indexEntities)) {
- log.logFine("Deleted all " + indexEntities.length + " transferred whole-word indexes locally");
+ if (deleteTransferIndexes(indexContainers)) {
+ log.logFine("Deleted all " + indexContainers.length + " transferred whole-word indexes locally");
return indexCount;
} else {
log.logSevere("Deleted not all transferred whole-word indexes");
@@ -299,13 +299,13 @@ public final class plasmaWordIndexDistribution {
}
} else {
// simply close the indexEntities
- closeTransferIndexes (indexEntities);
+ closeTransferIndexes(indexContainers);
}
return indexCount;
} else {
log.logSevere("Index distribution failed. Too few peers (" + hc1 + ") received the index, not deleted locally.");
// simply close the indexEntities
- closeTransferIndexes (indexEntities);
+ closeTransferIndexes(indexContainers);
return -1;
}
}
@@ -322,15 +322,16 @@ public final class plasmaWordIndexDistribution {
return startPointHash;
}
- Object[] /* of {plasmaWordIndexEntity[], HashMap(String, plasmaCrawlLURL.Entry)}*/
- selectTransferIndexes(String hash, int count, int maxOpenFiles) {
+ Object[] /* of {plasmaWordIndexEntryContainer[], HashMap(String, plasmaCrawlLURL.Entry)}*/
+ selectTransferContainers(String hash, int count, int maxOpenFiles) {
// the hash is a start hash from where the indexes are picked
- ArrayList tmpEntities = new ArrayList(count);
+ ArrayList tmpContainers = new ArrayList(count);
String nexthash = "";
try {
int currOpenFiles = 0;
Iterator wordHashIterator = this.wordIndex.wordHashes(hash, true, true);
- plasmaWordIndexEntity indexEntity, tmpEntity;
+ plasmaWordIndexEntity indexEntity;
+ plasmaWordIndexEntryContainer indexContainer;
Iterator urlIter;
Iterator hashIter;
plasmaWordIndexEntry indexEntry;
@@ -343,56 +344,15 @@ public final class plasmaWordIndexDistribution {
(wordHashIterator.hasNext()) &&
((nexthash = (String) wordHashIterator.next()) != null) &&
(nexthash.trim().length() > 0) &&
- ((currOpenFiles == 0) || (yacyDHTAction.dhtDistance(nexthash,
- ((plasmaWordIndexEntity)tmpEntities.get(0)).wordHash()) < 0.2))
+ ((currOpenFiles == 0) ||
+ (yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntity)tmpContainers.get(0)).wordHash()) < 0.2))
) {
indexEntity = this.wordIndex.getEntity(nexthash, true, -1);
if (indexEntity.size() == 0) {
indexEntity.deleteComplete();
- } else if ((indexEntity.size() <= count)|| // if we havn't exceeded the limit
- (Math.abs(indexEntity.size() - count) <= 10)){ // or there are only at most 10 entries left
- // take the whole entity
- try {
- // fist check if we know all urls
- urlIter = indexEntity.elements(true);
- unknownURLEntries.clear();
- while (urlIter.hasNext()) {
- indexEntry = (plasmaWordIndexEntry) urlIter.next();
- try {
- lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry);
- if ((lurl == null) || (lurl.url() == null)) {
- unknownURLEntries.add(indexEntry.getUrlHash());
- } else {
- knownURLs.put(indexEntry.getUrlHash(), lurl);
- }
- } catch (IOException e) {
- unknownURLEntries.add(indexEntry.getUrlHash());
- }
- }
- // now delete all entries that have no url entry
- hashIter = unknownURLEntries.iterator();
- while (hashIter.hasNext()) {
- String nextUrlHash = (String) hashIter.next();
- indexEntity.removeEntry(nextUrlHash, false);
- this.urlPool.loadedURL.remove(nextUrlHash);
- }
-
- if (indexEntity.size() == 0) {
- indexEntity.deleteComplete();
- } else {
- // use whats remaining
- tmpEntities.add(indexEntity);
- this.log.logFine("Selected whole index (" + indexEntity.size() + " URLs, " + unknownURLEntries.size() + " not bound) for word " + indexEntity.wordHash());
- count -= indexEntity.size();
- currOpenFiles++;
- }
- } catch (kelondroException e) {
- this.log.logSevere("plasmaWordIndexDistribution/1: deleted DB for word " + indexEntity.wordHash(), e);
- indexEntity.deleteComplete();
- }
} else {
// make an on-the-fly entity and insert values
- tmpEntity = new plasmaWordIndexEntity(indexEntity.wordHash());
+ indexContainer = new plasmaWordIndexEntryContainer(indexEntity.wordHash());
try {
urlIter = indexEntity.elements(true);
unknownURLEntries.clear();
@@ -404,7 +364,7 @@ public final class plasmaWordIndexDistribution {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
- tmpEntity.addEntry(indexEntry);
+ indexContainer.add(indexEntry);
count--;
}
} catch (IOException e) {
@@ -426,8 +386,8 @@ public final class plasmaWordIndexDistribution {
}
// use whats remaining
- this.log.logFine("Selected partial index (" + tmpEntity.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + tmpEntity.wordHash());
- tmpEntities.add(tmpEntity);
+ this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + indexContainer.wordHash());
+ tmpContainers.add(indexContainer);
} catch (kelondroException e) {
this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + indexEntity.wordHash(), e);
indexEntity.deleteComplete();
@@ -438,8 +398,8 @@ public final class plasmaWordIndexDistribution {
}
// transfer to array
- plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]);
- return new Object[]{indexEntities, knownURLs, new Integer(currOpenFiles)};
+ plasmaWordIndexEntryContainer[] entryContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
+ return new Object[]{entryContainers, knownURLs, new Integer(currOpenFiles)};
} catch (IOException e) {
this.log.logSevere("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage(), e);
return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)};
@@ -477,6 +437,40 @@ public final class plasmaWordIndexDistribution {
} catch (IOException ee) {}
}
+ void closeTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) {
+ for (int i = 0; i < indexContainers.length; i++) {
+ indexContainers[i] = null;
+ }
+ }
+
+ boolean deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) throws IOException {
+ Iterator urlIter;
+ plasmaWordIndexEntry indexEntry;
+ plasmaWordIndexEntity indexEntity;
+ String[] urlHashes;
+ int sz;
+ boolean success = true;
+ for (int i = 0; i < indexContainers.length; i++) {
+ // delete entries separately
+ int c = 0;
+ urlHashes = new String[indexContainers[i].size()];
+ urlIter = indexContainers[i].entries();
+ while (urlIter.hasNext()) {
+ indexEntry = (plasmaWordIndexEntry) urlIter.next();
+ urlHashes[c++] = indexEntry.getUrlHash();
+ }
+ wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true);
+ indexEntity = wordIndex.getEntity(indexContainers[i].wordHash(), true, -1);
+ sz = indexEntity.size();
+ // indexEntity.close();
+ closeTransferIndex(indexEntity);
+ log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + sz + " entries left");
+ indexContainers[i] = null;
+ }
+ return success;
+ }
+
+/*
boolean deleteTransferIndexes(plasmaWordIndexEntity[] indexEntities) throws IOException {
Iterator urlIter;
plasmaWordIndexEntry indexEntry;
@@ -500,13 +494,6 @@ public final class plasmaWordIndexDistribution {
// indexEntity.close();
closeTransferIndex(indexEntity);
log.logFine("Deleted partial index (" + c + " URLs) for word " + indexEntities[i].wordHash() + "; " + sz + " entries left");
- // DEBUG: now try to delete the remaining index. If this works, this routine is fine
- /*
- if (wordIndex.getEntity(indexEntities[i].wordHash()).deleteComplete())
- System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " SUCCESSFULL");
- else
- System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " FAILED");
- */
// end debug
indexEntities[i].close();
} else {
@@ -516,7 +503,7 @@ public final class plasmaWordIndexDistribution {
} else {
indexEntities[i].close();
// have another try...
- if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot() /*PLASMADB*/, indexEntities[i].wordHash()).delete())) {
+ if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot(), indexEntities[i].wordHash()).delete())) {
success = false;
log.logSevere("Could not delete whole index for word " + indexEntities[i].wordHash());
}
@@ -526,7 +513,8 @@ public final class plasmaWordIndexDistribution {
}
return success;
}
-
+ */
+
public void startTransferWholeIndex(yacySeed seed, boolean delete) {
if (transferIdxThread == null) {
this.transferIdxThread = new transferIndexThread(seed,delete);
@@ -573,14 +561,14 @@ public final class plasmaWordIndexDistribution {
// word chunk
private String endPointHash;
private String startPointHash;
- plasmaWordIndexEntity[] indexEntities;
+ plasmaWordIndexEntryContainer[] indexContainers;
// other fields
HashMap urlCache;
public transferIndexWorkerThread(
yacySeed seed,
- plasmaWordIndexEntity[] indexEntities,
+ plasmaWordIndexEntryContainer[] indexContainers,
HashMap urlCache,
boolean gzipBody,
int timeout,
@@ -594,7 +582,7 @@ public final class plasmaWordIndexDistribution {
this.timeout4Transfer = timeout;
this.iteration = iteration;
this.seed = seed;
- this.indexEntities = indexEntities;
+ this.indexContainers = indexContainers;
this.urlCache = urlCache;
this.idxCount = idxCount;
this.chunkSize = chunkSize;
@@ -657,11 +645,11 @@ public final class plasmaWordIndexDistribution {
// transfering seleted words to remote peer
this.status = "Running: Transfering chunk " + iteration;
- String error = yacyClient.transferIndex(seed, indexEntities, urlCache, gzipBody4Transfer, timeout4Transfer);
+ String error = yacyClient.transferIndex(seed, indexContainers, urlCache, gzipBody4Transfer, timeout4Transfer);
if (error == null) {
// words successfully transfered
transferTime = System.currentTimeMillis() - start;
- plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "]" +
+ plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length-1].wordHash() + "]" +
" to peer " + seed.getName() + ":" + seed.hash + " in " + (transferTime/1000) + " seconds successfull (" +
(1000 * idxCount / (transferTime + 1)) + " words/s)");
retryCount = 0;
@@ -817,7 +805,7 @@ public final class plasmaWordIndexDistribution {
}
public void performTransferWholeIndex() {
- plasmaWordIndexEntity[] newIndexEntities = null, oldIndexEntities = null;
+ plasmaWordIndexEntryContainer[] newIndexContainers = null, oldIndexContainers = null;
try {
// pausing the regular index distribution
// TODO: adding sync, to wait for a still running index distribution to finish
@@ -838,12 +826,12 @@ public final class plasmaWordIndexDistribution {
iteration++;
int idxCount = 0;
selectionStart = System.currentTimeMillis();
- oldIndexEntities = newIndexEntities;
+ oldIndexContainers = newIndexContainers;
// selecting 500 words to transfer
this.status = "Running: Selecting chunk " + iteration;
- Object[] selectResult = selectTransferIndexes(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
- newIndexEntities = (plasmaWordIndexEntity[]) selectResult[0];
+ Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
+ newIndexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
openedFiles = (Integer) selectResult[2];
@@ -851,7 +839,7 @@ public final class plasmaWordIndexDistribution {
* a) no words are left in the index
* b) max open file limit was exceeded
*/
- if ((newIndexEntities == null) || (newIndexEntities.length == 0)) {
+ if ((newIndexContainers == null) || (newIndexContainers.length == 0)) {
if (sb.wordIndex.size() > 0) {
// if there are still words in the index we try it again now
startPointHash = "------------";
@@ -863,15 +851,15 @@ public final class plasmaWordIndexDistribution {
}
} else {
// count the indexes again, can be smaller as expected
- for (int i = 0; i < newIndexEntities.length; i++) idxCount += newIndexEntities[i].size();
+ for (int i = 0; i < newIndexContainers.length; i++) idxCount += newIndexContainers[i].size();
// getting start point for next DHT-selection
oldStartingPointHash = startPointHash;
- startPointHash = newIndexEntities[newIndexEntities.length - 1].wordHash(); // DHT targets must have greater hashes
+ startPointHash = newIndexContainers[newIndexContainers.length - 1].wordHash(); // DHT targets must have greater hashes
selectionEnd = System.currentTimeMillis();
selectionTime = selectionEnd - selectionStart;
- plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexEntities[0].wordHash() + " .. " + newIndexEntities[newIndexEntities.length-1].wordHash() + "]" +
+ plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexContainers[0].wordHash() + " .. " + newIndexContainers[newIndexContainers.length-1].wordHash() + "]" +
" in " +
(selectionTime / 1000) + " seconds (" +
(1000 * idxCount / (selectionTime+1)) + " words/s)");
@@ -886,10 +874,10 @@ public final class plasmaWordIndexDistribution {
this.status = "Aborted because of Transfer error:\n" + worker.getStatus();
// cleanup. closing all open files
- closeEntities(oldIndexEntities);
- oldIndexEntities = null;
- closeEntities(newIndexEntities);
- newIndexEntities = null;
+ closeContainers(oldIndexContainers);
+ oldIndexContainers = null;
+ closeContainers(newIndexContainers);
+ newIndexContainers = null;
// abort index transfer
return;
@@ -922,10 +910,10 @@ public final class plasmaWordIndexDistribution {
if (delete) {
this.status = "Running: Deleting chunk " + iteration;
try {
- if (deleteTransferIndexes(oldIndexEntities)) {
- plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexEntities.length + " transferred whole-word indexes locally");
+ if (deleteTransferIndexes(oldIndexContainers)) {
+ plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexContainers.length + " transferred whole-word indexes locally");
transferedEntryCount += idxCount;
- transferedEntityCount += oldIndexEntities.length;
+ transferedEntityCount += oldIndexContainers.length;
} else {
plasmaWordIndexDistribution.this.log.logSevere("Deleted not all transferred whole-word indexes");
}
@@ -933,18 +921,18 @@ public final class plasmaWordIndexDistribution {
plasmaWordIndexDistribution.this.log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee);
}
} else {
- this.closeEntities(oldIndexEntities);
+ this.closeContainers(oldIndexContainers);
transferedEntryCount += idxCount;
- transferedEntityCount += oldIndexEntities.length;
+ transferedEntityCount += oldIndexContainers.length;
}
- oldIndexEntities = null;
+ oldIndexContainers = null;
}
this.worker = null;
}
// handover chunk to transfer worker
- if (!((newIndexEntities == null) || (newIndexEntities.length == 0))) {
- worker = new transferIndexWorkerThread(seed,newIndexEntities,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash);
+ if (!((newIndexContainers == null) || (newIndexContainers.length == 0))) {
+ worker = new transferIndexWorkerThread(seed,newIndexContainers,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash);
worker.start();
}
}
@@ -961,30 +949,21 @@ public final class plasmaWordIndexDistribution {
try {worker.join();}catch(Exception e){}
// worker = null;
}
- if (oldIndexEntities != null) closeEntities(oldIndexEntities);
- if (newIndexEntities != null) closeEntities(newIndexEntities);
+ if (oldIndexContainers != null) closeContainers(oldIndexContainers);
+ if (newIndexContainers != null) closeContainers(newIndexContainers);
plasmaWordIndexDistribution.this.paused = false;
}
}
- private void closeEntities(plasmaWordIndexEntity[] indexEntities) {
- if ((indexEntities == null)||(indexEntities.length ==0)) return;
+ private void closeContainers(plasmaWordIndexEntryContainer[] indexContainers) {
+ if ((indexContainers == null)||(indexContainers.length ==0)) return;
- for (int i = 0; i < indexEntities.length; i++) try {
- indexEntities[i].close();
- } catch (IOException ee) {}
- }
-
- /*
- private boolean isAborted() {
- if (finished || Thread.currentThread().isInterrupted()) {
- this.status = "aborted";
- return true;
- }
- return false;
+ for (int i = 0; i < indexContainers.length; i++) {
+ indexContainers[i] = null;
+ }
}
- */
+
}
}
diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java
index 94ee53522..723cf82a7 100644
--- a/source/de/anomic/plasma/plasmaWordIndexEntity.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java
@@ -48,7 +48,6 @@ import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.TreeMap;
-import java.util.Set;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroException;
@@ -111,6 +110,7 @@ public final class plasmaWordIndexEntity {
hash.substring(4,6) + "/" + hash + ".db");
}
+ /*
public plasmaWordIndexEntity(String wordHash) {
// this creates a nameless temporary index. It is needed for combined search
// and used to hold the intersection of two indexes
@@ -121,7 +121,7 @@ public final class plasmaWordIndexEntity {
theLocation = null;
theTmpMap = new TreeMap();
}
-
+*/
public boolean isTMPEntity() {
return theTmpMap != null;
}
@@ -302,12 +302,6 @@ public final class plasmaWordIndexEntity {
else return "EMPTY";
}
- // join methods
- private static int log2(int x) {
- int l = 0;
- while (x > 0) {x = x >> 1; l++;}
- return l;
- }
public void merge(plasmaWordIndexEntity otherEntity, long time) throws IOException {
// this is a merge of another entity to this entity
@@ -324,6 +318,14 @@ public final class plasmaWordIndexEntity {
}
}
+ /*
+ // join methods
+ private static int log2(int x) {
+ int l = 0;
+ while (x > 0) {x = x >> 1; l++;}
+ return l;
+ }
+
public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException {
// big problem here: there cannot be a time-out for join, since a time-out will leave the joined set too big.
@@ -485,5 +487,5 @@ public final class plasmaWordIndexEntity {
}
return conj;
}
-
+*/
}
\ No newline at end of file
diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java
index e65d3c136..9cec2ac34 100644
--- a/source/de/anomic/plasma/plasmaWordIndexEntry.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java
@@ -100,18 +100,21 @@ public final class plasmaWordIndexEntry {
public static final char DT_UNKNOWN = 'u';
// appearance locations: (used for flags)
- public static final int AP_TITLE = 0; // title tag from html header
- public static final int AP_H1 = 1; // h1-tag
- public static final int AP_H2 = 2; // h2-tag
- public static final int AP_H3 = 3; // h3-tag
- public static final int AP_H4 = 4; // h4-tag
- public static final int AP_H5 = 5; // h5-tag
- public static final int AP_H6 = 6; // h6-tag
- public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam)
- public static final int AP_URL = 8; // word inside an url
- public static final int AP_IMG = 9; // tag inside image references
- public static final int AP_TAG = 10; // for tagged indexeing (i.e. using mp3 tags)
- public static final int AP_ANCHOR = 11; // anchor description
+ public static final int AP_TITLE = 0; // title tag from html header
+ public static final int AP_H1 = 1; // h1-tag
+ public static final int AP_H2 = 2; // h2-tag
+ public static final int AP_H3 = 3; // h3-tag
+ public static final int AP_H4 = 4; // h4-tag
+ public static final int AP_H5 = 5; // h5-tag
+ public static final int AP_H6 = 6; // h6-tag
+ public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam)
+ public static final int AP_URL = 8; // word inside an url
+ public static final int AP_IMG = 9; // tag inside image references
+    public static final int AP_TAG       = 10; // for tagged indexing (i.e. using mp3 tags)
+ public static final int AP_ANCHOR = 11; // anchor description
+ public static final int AP_BOLD = 12;
+ public static final int AP_ITALICS = 13;
+ public static final int AP_INVISIBLE = 14; // good for spam detection
// URL attributes
public static final int UA_LOCAL = 0; // URL was crawled locally
@@ -208,6 +211,8 @@ public final class plasmaWordIndexEntry {
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
public plasmaWordIndexEntry(String urlHash,
+ int urlLength, // byte-length of complete URL
+ int urlComps, // number of path components
int hitcount, //*how often appears this word in the text
int wordcount, //*total number of words
int phrasecount, //*total number of phrases
@@ -227,14 +232,9 @@ public final class plasmaWordIndexEntry {
// more needed attributes:
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc
// - boolean: URL attributes
- // - int: url-length (shorter are better)
- // - int: url-number of components / length of path
// - int: length of description tag / title tag (longer are better)
- // - int: number of chapters
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
- // - int: length of description
- // - int: length of title
// - int: # of keywords
if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk";
diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java
index 458ada0db..2737d5664 100644
--- a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java
@@ -54,12 +54,14 @@ package de.anomic.plasma;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.Set;
+import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
public final class plasmaWordIndexEntryContainer implements Comparable {
- private final String wordHash;
+ private String wordHash;
private final HashMap container; // urlHash/plasmaWordIndexEntry - Mapping
private long updateTime;
@@ -73,6 +75,15 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
container = new HashMap(initContainerSize); // a urlhash/plasmaWordIndexEntry - relation
}
+ public void setWordHash(String newWordHash) {
+ // this is used to replicate a container for different word indexes during global search
+ this.wordHash = newWordHash;
+ }
+
+ public void clear() {
+ container.clear();
+ }
+
public int size() {
return container.size();
}
@@ -85,14 +96,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return wordHash;
}
+ public int add(plasmaWordIndexEntry entry) {
+ return add(entry, System.currentTimeMillis());
+ }
+
public int add(plasmaWordIndexEntry entry, long updateTime) {
this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
- return (add(entry)) ? 1 : 0;
+ return (addi(entry)) ? 1 : 0;
}
public int add(plasmaWordIndexEntry[] entries, long updateTime) {
int c = 0;
- for (int i = 0; i < entries.length; i++) if (add(entries[i])) c++;
+ for (int i = 0; i < entries.length; i++) if (addi(entries[i])) c++;
this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
return c;
}
@@ -102,13 +117,13 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
Iterator i = c.entries();
int x = 0;
while (i.hasNext()) {
- if (add((plasmaWordIndexEntry) i.next())) x++;
+ if (addi((plasmaWordIndexEntry) i.next())) x++;
}
this.updateTime = java.lang.Math.max(this.updateTime, c.updateTime);
return x;
}
- private boolean add(plasmaWordIndexEntry entry) {
+ private boolean addi(plasmaWordIndexEntry entry) {
// returns true if the new entry was added, false if it already existet
return (container.put(entry.getUrlHash(), entry) == null);
}
@@ -117,10 +132,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return container.containsKey(urlHash);
}
+ public plasmaWordIndexEntry get(String urlHash) {
+ return (plasmaWordIndexEntry) container.get(urlHash);
+ }
+
public plasmaWordIndexEntry[] getEntryArray() {
return (plasmaWordIndexEntry[]) container.values().toArray();
}
+ public plasmaWordIndexEntry remove(String urlHash) {
+ return (plasmaWordIndexEntry) container.remove(urlHash);
+ }
+
public Iterator entries() {
// returns an iterator of plasmaWordIndexEntry objects
return container.values().iterator();
@@ -146,4 +169,126 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4));
}
+ public static plasmaWordIndexEntryContainer joinContainer(Set containers, long time, int maxDistance) {
+
+ long stamp = System.currentTimeMillis();
+
+ // order entities by their size
+ TreeMap map = new TreeMap();
+ plasmaWordIndexEntryContainer singleContainer;
+ Iterator i = containers.iterator();
+ int count = 0;
+ while (i.hasNext()) {
+ // get next entity:
+ singleContainer = (plasmaWordIndexEntryContainer) i.next();
+
+ // check result
+ if ((singleContainer == null) || (singleContainer.size() == 0)) return new plasmaWordIndexEntryContainer(null); // as this is a cunjunction of searches, we have no result if any word is not known
+
+ // store result in order of result size
+ map.put(new Long(singleContainer.size() * 1000 + count), singleContainer);
+ count++;
+ }
+
+ // check if there is any result
+ if (map.size() == 0) return new plasmaWordIndexEntryContainer(null); // no result, nothing found
+
+ // the map now holds the search results in order of number of hits per word
+ // we now must pairwise build up a conjunction of these sets
+ Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries
+ plasmaWordIndexEntryContainer searchA, searchB, searchResult = (plasmaWordIndexEntryContainer) map.remove(k);
+ while ((map.size() > 0) && (searchResult.size() > 0)) {
+ // take the first element of map which is a result and combine it with result
+ k = (Long) map.firstKey(); // the next smallest...
+ time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
+ searchA = searchResult;
+ searchB = (plasmaWordIndexEntryContainer) map.remove(k);
+ searchResult = plasmaWordIndexEntryContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance);
+ // free resources
+ searchA = null;
+ searchB = null;
+ }
+
+ // in 'searchResult' is now the combined search result
+ if (searchResult.size() == 0) return new plasmaWordIndexEntryContainer(null);
+ return searchResult;
+ }
+
+ // join methods
+ private static int log2(int x) {
+ int l = 0;
+ while (x > 0) {x = x >> 1; l++;}
+ return l;
+ }
+
+ public static plasmaWordIndexEntryContainer joinConstructive(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) {
+ if ((i1 == null) || (i2 == null)) return null;
+ if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntryContainer(null);
+
+ // decide which method to use
+ int high = ((i1.size() > i2.size()) ? i1.size() : i2.size());
+ int low = ((i1.size() > i2.size()) ? i2.size() : i1.size());
+ int stepsEnum = 10 * (high + low - 1);
+ int stepsTest = 12 * log2(high) * low;
+
+ // start most efficient method
+ if (stepsEnum > stepsTest) {
+ if (i1.size() < i2.size())
+ return joinConstructiveByTest(i1, i2, time, maxDistance);
+ else
+ return joinConstructiveByTest(i2, i1, time, maxDistance);
+ } else {
+ return joinConstructiveByEnumeration(i1, i2, time, maxDistance);
+ }
+ }
+
+ private static plasmaWordIndexEntryContainer joinConstructiveByTest(plasmaWordIndexEntryContainer small, plasmaWordIndexEntryContainer large, long time, int maxDistance) {
+ System.out.println("DEBUG: JOIN METHOD BY TEST");
+ plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result
+ Iterator se = small.entries();
+ plasmaWordIndexEntry ie0, ie1;
+ long stamp = System.currentTimeMillis();
+ while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
+ ie0 = (plasmaWordIndexEntry) se.next();
+ ie1 = large.get(ie0.getUrlHash());
+ if (ie1 != null) {
+ // this is a hit. Calculate word distance:
+ ie0.combineDistance(ie1);
+ if (ie0.worddistance() <= maxDistance) conj.add(ie0);
+ }
+ }
+ return conj;
+ }
+
+ private static plasmaWordIndexEntryContainer joinConstructiveByEnumeration(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) {
+ System.out.println("DEBUG: JOIN METHOD BY ENUMERATION");
+ plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result
+ Iterator e1 = i1.entries();
+ Iterator e2 = i2.entries();
+ int c;
+ if ((e1.hasNext()) && (e2.hasNext())) {
+ plasmaWordIndexEntry ie1;
+ plasmaWordIndexEntry ie2;
+ ie1 = (plasmaWordIndexEntry) e1.next();
+ ie2 = (plasmaWordIndexEntry) e2.next();
+
+ long stamp = System.currentTimeMillis();
+ while ((System.currentTimeMillis() - stamp) < time) {
+ c = ie1.getUrlHash().compareTo(ie2.getUrlHash());
+ if (c < 0) {
+ if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
+ } else if (c > 0) {
+ if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
+ } else {
+ // we have found the same urls in different searches!
+ ie1.combineDistance(ie2);
+ if (ie1.worddistance() <= maxDistance) conj.add(ie1);
+ if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
+ if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
+ }
+ }
+ }
+ return conj;
+ }
+
}
diff --git a/source/de/anomic/plasma/plasmaWordIndexInterface.java b/source/de/anomic/plasma/plasmaWordIndexInterface.java
index 078518d2a..2026d8f59 100644
--- a/source/de/anomic/plasma/plasmaWordIndexInterface.java
+++ b/source/de/anomic/plasma/plasmaWordIndexInterface.java
@@ -50,7 +50,8 @@ public interface plasmaWordIndexInterface {
public Iterator wordHashes(String startWordHash, boolean up);
- public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime);
+ public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime);
+ public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime);
public long getUpdateTime(String wordHash);
public void deleteIndex(String wordHash);
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index e6f6d05e2..76d550912 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -50,12 +50,13 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
+
+import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.plasma.plasmaURLPattern;
@@ -348,14 +349,15 @@ public final class yacyClient {
}
public static int search(
- String wordhashes,
+ String wordhashes,
+ int maxDistance,
boolean global,
yacySeed targetPeer,
- plasmaCrawlLURL urlManager,
- plasmaWordIndexEntity entityCache,
- plasmaURLPattern blacklist,
- plasmaSnippetCache snippets,
- plasmaSearchProfile profile
+ plasmaCrawlLURL urlManager,
+ plasmaWordIndexEntryContainer containerCache,
+ plasmaURLPattern blacklist,
+ plasmaSnippetCache snippets,
+ plasmaSearchProfile profile
) {
// send a search request to peer with remote Hash
// this mainly converts the words into word hashes
@@ -403,6 +405,7 @@ public final class yacyClient {
obj.put("ttl", "0");
obj.put("duetime", Long.toString(duetime));
obj.put("profile", profile.targetToString()); // new duetimes splitted by specific search tasks
+ obj.put("maxdist", maxDistance);
obj.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date()));
//yacyCore.log.logDebug("yacyClient.search url=" + url);
@@ -460,6 +463,9 @@ public final class yacyClient {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist
+ int urlLength = urlEntry.url().toString().length();
+ int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
+
urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
final plasmaWordIndexEntry entry;
@@ -467,6 +473,7 @@ public final class yacyClient {
// the old way to define words
entry = new plasmaWordIndexEntry(
urlEntry.hash(),
+ urlLength, urlComps,
urlEntry.wordCount(),
0, 0, 0, 0, 0, 0,
urlEntry.size(),
@@ -494,7 +501,7 @@ public final class yacyClient {
}
// finally insert the containers to the index
- for (int m = 0; m < words; m++) { entityCache.addEntries(container[m]); }
+ for (int m = 0; m < words; m++) { containerCache.add(container[m]); }
// generate statistics
long searchtime;
@@ -841,7 +848,7 @@ public final class yacyClient {
httpHeader requestHeader) throws IOException {
*/
- public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
+ public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
HashMap in = transferRWI(targetSeed, indexes, gzipBody, timeout);
if (in == null) { return "no_connection_1"; }
@@ -875,7 +882,7 @@ public final class yacyClient {
return null;
}
- private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, boolean gzipBody, int timeout) {
+ private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, boolean gzipBody, int timeout) {
final String address = targetSeed.getAddress();
if (address == null) { return null; }
@@ -903,7 +910,7 @@ public final class yacyClient {
Iterator eenum;
plasmaWordIndexEntry entry;
for (int i = 0; i < indexes.length; i++) {
- eenum = indexes[i].elements(true);
+ eenum = indexes[i].entries();
while (eenum.hasNext()) {
entry = (plasmaWordIndexEntry) eenum.next();
entrypost.append(indexes[i].wordHash())
diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java
index cb72dca6c..2e82cd24c 100644
--- a/source/de/anomic/yacy/yacySearch.java
+++ b/source/de/anomic/yacy/yacySearch.java
@@ -52,8 +52,8 @@ import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaURLPattern;
import de.anomic.plasma.plasmaSnippetCache;
-import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaSearchProfile;
+import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.logging.serverLog;
public class yacySearch extends Thread {
@@ -61,29 +61,31 @@ public class yacySearch extends Thread {
final private Set wordhashes;
final private boolean global;
final private plasmaCrawlLURL urlManager;
- final private plasmaWordIndexEntity entityCache;
+ final private plasmaWordIndexEntryContainer containerCache;
final private plasmaURLPattern blacklist;
final private plasmaSnippetCache snippetCache;
final private yacySeed targetPeer;
private int links;
+ private int maxDistance;
final private plasmaSearchProfile profile;
- public yacySearch(Set wordhashes, boolean global, yacySeed targetPeer,
- plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
+ public yacySearch(Set wordhashes, int maxDistance, boolean global, yacySeed targetPeer,
+ plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
super("yacySearch_" + targetPeer.getName());
this.wordhashes = wordhashes;
this.global = global;
this.urlManager = urlManager;
- this.entityCache = entityCache;
+ this.containerCache = containerCache;
this.blacklist = blacklist;
this.snippetCache = snippetCache;
this.targetPeer = targetPeer;
this.links = -1;
+ this.maxDistance = maxDistance;
this.profile = (plasmaSearchProfile) profile.clone();
}
public void run() {
- this.links = yacyClient.search(set2string(wordhashes), global, targetPeer, urlManager, entityCache, blacklist, snippetCache, profile);
+ this.links = yacyClient.search(set2string(wordhashes), maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, profile);
if (links != 0) {
//yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes);
yacyCore.seedDB.mySeed.incRI(links);
@@ -172,7 +174,7 @@ public class yacySearch extends Thread {
return result;
}
- public static yacySearch[] searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache,
+ public static yacySearch[] searchHashes(Set wordhashes, int maxDist, plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache,
int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
// check own peer status
if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; }
@@ -185,8 +187,8 @@ public class yacySearch extends Thread {
if (targets == 0) return null;
yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) {
- searchThreads[i]= new yacySearch(wordhashes, true, targetPeers[i],
- urlManager, entityCache, blacklist, snippetCache, profile);
+ searchThreads[i]= new yacySearch(wordhashes, maxDist, true, targetPeers[i],
+ urlManager, containerCache, blacklist, snippetCache, profile);
searchThreads[i].start();
try {Thread.sleep(20);} catch (InterruptedException e) {}
@@ -216,5 +218,4 @@ public class yacySearch extends Thread {
}
}
-
}