changes towards the new index storage scheme:

- replaced usage of temporary IndexEntity by EntryContainer
- added more attributes to word index
- added exact-string search (using quotes in query)
- disabled writing into WORDS during search; EntryContainers are used instead


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1485 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent c81ad1bf34
commit 03c65742ba
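
The exact-string search announced above is implemented in index.java (first hunk of that file below): a query wrapped in double quotes is unwrapped and maxDistance is lowered from Integer.MAX_VALUE to 1, which joinContainer() later uses to keep only pages where the query words stand next to each other. The following is a minimal standalone sketch of that parsing step; the class name is illustrative, and the length guard is an addition for this sketch (the committed code calls charAt(0) on the raw query without one):

    public class ExactSearchSketch {
        public static void main(String[] args) {
            String querystring = "\"red pony\"".trim();
            int maxDistance = Integer.MAX_VALUE; // default: query words may be any distance apart
            if ((querystring.length() > 1)                        // guard added for this sketch
                    && (querystring.charAt(0) == '"')
                    && (querystring.charAt(querystring.length() - 1) == '"')) {
                // strip the quotes and require adjacent words (distance 1)
                querystring = querystring.substring(1, querystring.length() - 1).trim();
                maxDistance = 1;
            }
            System.out.println(querystring + " / maxDistance=" + maxDistance); // prints: red pony / maxDistance=1
        }
    }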

@@ -62,6 +62,7 @@ import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaURL;
 import de.anomic.plasma.plasmaWordIndexEntity;
 import de.anomic.plasma.plasmaWordIndexEntry;
+import de.anomic.plasma.plasmaWordIndexEntryContainer;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
 import de.anomic.yacy.yacyClient;
@@ -255,12 +256,12 @@ public class IndexControl_p {
         }
         prop.put("urlstring", "");
         prop.put("urlhash", "");
-        plasmaWordIndexEntity[] indexes = new plasmaWordIndexEntity[1];
+        plasmaWordIndexEntryContainer[] indexes = new plasmaWordIndexEntryContainer[1];
         String result;
         long starttime = System.currentTimeMillis();
-        indexes[0] = switchboard.wordIndex.getEntity(keyhash, true, -1);
+        indexes[0] = switchboard.wordIndex.getContainer(keyhash, true, -1);
         // built urlCache
-        Iterator urlIter = indexes[0].elements(true);
+        Iterator urlIter = indexes[0].entries();
         HashMap knownURLs = new HashMap();
         HashSet unknownURLEntries = new HashSet();
         plasmaWordIndexEntry indexEntry;
@@ -282,9 +283,7 @@ public class IndexControl_p {
         // now delete all entries that have no url entry
         Iterator hashIter = unknownURLEntries.iterator();
         while (hashIter.hasNext()) {
-            try {
-                indexes[0].removeEntry((String) hashIter.next(), false);
-            } catch (IOException e) {}
+            indexes[0].remove((String) hashIter.next());
         }
         // use whats remaining
         String gzipBody = switchboard.getConfig("indexControl.gzipBody","false");
@@ -296,7 +295,8 @@ public class IndexControl_p {
                 "true".equalsIgnoreCase(gzipBody),
                 timeout);
         prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
-        try {indexes[0].close();} catch (IOException e) {}
+        indexes[0] = null;
+        indexes = null;
         }
         // generate list
@@ -431,15 +431,15 @@ public class IndexControl_p {
     public static String genUrlList(plasmaSwitchboard switchboard, String keyhash, String keystring) {
         // search for a word hash and generate a list of url links
-        plasmaWordIndexEntity index = null;
+        plasmaWordIndexEntryContainer index = null;
         try {
-            index = switchboard.wordIndex.getEntity(keyhash, true, -1);
+            index = switchboard.wordIndex.getContainer(keyhash, true, -1);
             final StringBuffer result = new StringBuffer(1024);
             if (index.size() == 0) {
                 result.append("No URL entries related to this word hash <span class=\"tt\">").append(keyhash).append("</span>.");
             } else {
-                final Iterator en = index.elements(true);
+                final Iterator en = index.entries();
                 result.append("URL entries related to this word hash <span class=\"tt\">").append(keyhash).append("</span><br><br>");
                 result.append("<form action=\"IndexControl_p.html\" method=\"post\" enctype=\"multipart/form-data\">");
                 String us;
@@ -497,13 +497,12 @@ public class IndexControl_p {
                     .append("<span class=\"small\">for every resolveable and deleted URL reference, delete the same reference at every other word where the reference exists (very extensive, but prevents further unresolved references)</span>")
                     .append("</td></tr></table></fieldset></form><br>");
             }
-            index.close();
             index = null;
             return result.toString();
         } catch (IOException e) {
             return "";
         } finally {
-            if (index != null) try { index.close(); index = null; } catch (Exception e) {};
+            if (index != null) index = null;
         }
     }

@@ -463,7 +463,7 @@ public class dir {
                 "AAAAAAAAAAAA", /*referrer*/
                 0, /*copycount*/
                 false, /*localneed*/
-                condenser.RESULT_INFORMATION_VALUE,
+                condenser.RESULT_WORD_ENTROPHY,
                 "**", /*language*/
                 plasmaWordIndexEntry.DT_SHARE, /*doctype*/
                 phrase.length(), /*size*/

@@ -126,7 +126,12 @@ public class index {
         // SEARCH
         // process search words
-        final String querystring = post.get("search", "");
+        int maxDistance = Integer.MAX_VALUE;
+        String querystring = post.get("search", "").trim();
+        if ((querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
+            querystring = querystring.substring(1, querystring.length() - 1).trim();
+            maxDistance = 1;
+        }
         if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {}
         final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
         // filter out stopwords
@@ -172,7 +177,7 @@ public class index {
         }
         // do the search
-        plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
+        plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, maxDistance, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
             ((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
             "", 20);
         final serverObjects prop = sb.searchFromLocal(thisSearch);

@@ -47,7 +47,6 @@
 // javac -classpath .:../../Classes search.java
 // if the shell's current path is htroot/yacy
-import java.io.IOException;
 import java.util.HashSet;
 import de.anomic.http.httpHeader;
 import de.anomic.plasma.plasmaCrawlLURL;
@@ -81,6 +80,7 @@ public final class search {
     // final String fwden = post.get("fwden", ""); // forward deny, a list of seed hashes. They may NOT be target of forward hopping
     final long duetime= post.getLong("duetime", 3000);
     final int count = post.getInt("count", 10); // maximum number of wanted results
+    final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
     // final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
     // Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@@ -103,7 +103,7 @@ public final class search {
     }
     final long timestamp = System.currentTimeMillis();
-    plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
+    plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
         count, duetime, ".*");
     squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
@@ -114,11 +114,8 @@ public final class search {
     plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
     plasmaSearchResult acc = null;
     int idxc = 0;
-    try {
     idxc = theSearch.localSearch();
     acc = theSearch.order();
-    } catch (IOException e) {
-    }
     // result is a List of urlEntry elements
     if ((idxc == 0) || (acc == null)) {

@@ -75,6 +75,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         linkTags0.add("img");
         linkTags0.add("base");
         linkTags0.add("frame");
+        linkTags0.add("meta");
         linkTags1 = new TreeSet(insensitiveCollator);
         linkTags1.add("a");
@@ -88,6 +89,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
     // class variables: collectors for links
     private HashMap anchors;
     private HashMap images;
+    private HashMap metas;
     private String title;
     //private String headline;
     private List[] headlines;
@@ -101,6 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         this.root = root;
         this.anchors = new HashMap();
         this.images = new HashMap();
+        this.metas = new HashMap();
         this.title = "";
         this.headlines = new ArrayList[4];
         for (int i = 0; i < 4; i++) headlines[i] = new ArrayList();
@@ -194,6 +197,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         }
     }

+    public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
+
+    public static String[] urlComps(String normalizedURL) {
+        return normalizedURL.toLowerCase().split(splitrex); // word components of the url
+    }
+
     private String absolutePath(String relativePath) {
         try {
             return urlNormalform(new URL(root, relativePath));
@@ -206,6 +214,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
         if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {}
         if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
+        if (tagname.equalsIgnoreCase("meta")) metas.put((tagopts.getProperty("name", "")).toLowerCase(), tagopts.getProperty("content",""));
     }

     public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
@@ -252,10 +261,16 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         // construct a title string, even if the document has no title
         // if there is one, return it
         if (title.length() > 0) return title;
         // othervise take any headline
         for (int i = 0; i < 4; i++) {
             if (headlines[i].size() > 0) return (String) headlines[i].get(0);
         }
+        // take description tag
+        String s = getDescription();
+        if (s.length() > 0) return s;
         // extract headline from content
         if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80));
         return cleanLine(content.trim().toString());
@@ -280,6 +295,45 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         return images;
     }

+    public Map getMetas() {
+        return metas;
+    }
+
+    public String getDescription() {
+        String s = (String) metas.get("description");
+        if (s == null) return ""; else return s;
+    }
+
+    public String getContentType() {
+        String s = (String) metas.get("content-type");
+        if (s == null) return ""; else return s;
+    }
+
+    public String getCopyright() {
+        String s = (String) metas.get("copyright");
+        if (s == null) return ""; else return s;
+    }
+
+    public String[] getContentLanguages() {
+        String s = (String) metas.get("content-language");
+        if (s == null) s = "";
+        return s.split(" |,");
+    }
+
+    public String[] getKeywords() {
+        String s = (String) metas.get("keywords");
+        if (s == null) s = "";
+        if (s.length() == 0) {
+            return getTitle().toLowerCase().split(splitrex);
+        } else {
+            return s.split(" |,");
+        }
+    }
+
+    /*
+     * (non-Javadoc)
+     * @see de.anomic.htmlFilter.htmlFilterScraper#close()
+     */
     public void close() {
         // free resources
         super.close();
@@ -298,6 +352,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         }
         System.out.println("ANCHORS :" + anchors.toString());
         System.out.println("IMAGES :" + images.toString());
+        System.out.println("METAS :" + metas.toString());
         System.out.println("TEXT :" + new String(content.getBytes()));
     }

@@ -850,11 +850,11 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
             this.rot = rotating;
             ii = new nodeIterator(asc, rot, start);
             nextNode = (ii.hasNext()) ? (Node) ii.next() : null;
-            if (nextNode != null) {
+            if ((nextNode != null) && (nextNode.getKey() != null)) {
                 int c = objectOrder.compare(firstKey, nextNode.getKey());
                 if ((c > 0) && (asc)) {
                     // firstKey > nextNode.getKey()
-                    log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey()));
+                    if (log != null) log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey()));
                     nextNode = (ii.hasNext()) ? (Node) ii.next() : null;
                 }
                 if ((c < 0) && (!(asc))) {

@@ -83,9 +83,6 @@ public final class plasmaCondenser {
     public int RESULT_NUMB_SENTENCES = -1;
     public int RESULT_DIFF_SENTENCES = -1;
     public int RESULT_SIMI_SENTENCES = -1;
-    public int RESULT_AVERAGE_WORD_OCC = -1;
-    public int RESULT_INFORMATION_VALUE = -1;

     public plasmaCondenser(InputStream text) {
         this(text, 3, 2);
@@ -357,8 +354,7 @@ public final class plasmaCondenser {
         this.RESULT_NUMB_SENTENCES = allsentencecounter;
         this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
         this.RESULT_SIMI_SENTENCES = sentences.size();
-        this.RESULT_AVERAGE_WORD_OCC = (words.size() == 0) ? 0 : (allwordcounter / words.size());
-        this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
+        //this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
     }

     public void print() {

@@ -176,7 +176,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
         gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
     }

-    public synchronized Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
+    public Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
         return new Entry(hash, searchedWord);
     }
@@ -399,7 +399,15 @@ public final class plasmaCrawlLURL extends plasmaURL {
         private int size;
         private int wordCount;
         private String snippet;
-        private plasmaWordIndexEntry word;
+        private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests
+
+        // more needed attributes:
+        // - author / copyright owner
+        // - keywords
+        // - phrasecount, total number of phrases
+        // - boolean: URL attributes
+        // - int: # of outlinks to same domain
+        // - int: # of outlinks to outside domain

         public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
             // create new entry and store it into database

@@ -63,7 +63,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
     private plasmaWordIndex wordIndex;
     private plasmaCrawlLURL urlStore;
     private plasmaSnippetCache snippetCache;
-    private plasmaWordIndexEntity rcLocal, rcGlobal; // caches for results
+    private plasmaWordIndexEntryContainer rcLocal, rcGlobal; // caches for results
     private plasmaSearchProfile profileLocal, profileGlobal;
     private yacySearch[] searchThreads;
@@ -73,8 +73,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
         this.query = query;
         this.urlStore = urlStore;
         this.snippetCache = snippetCache;
-        this.rcLocal = new plasmaWordIndexEntity(null);
-        this.rcGlobal = new plasmaWordIndexEntity(null);
+        this.rcLocal = new plasmaWordIndexEntryContainer(null);
+        this.rcGlobal = new plasmaWordIndexEntryContainer(null);
         if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) {
             this.profileLocal = new plasmaSearchProfile(4 * query.maximumTime / 10, query.wantedResults);
             this.profileGlobal = new plasmaSearchProfile(6 * query.maximumTime / 10, query.wantedResults);
@@ -114,7 +114,6 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             int globalContributions = globalSearch(fetchpeers);
             log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
-            try {
             // combine the result and order
             plasmaSearchResult result = order();
             result.globalContributions = globalContributions;
@@ -125,57 +124,46 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
             //serverInstantThread.oneTimeJob(this, "flushResults", log, 0);
             // clean up
-            if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close();
             rcLocal = null;
             // return search result
             log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
             lastEvent = this;
             return result;
-            } catch (IOException e) {
-                return null;
-            }
         } else {
-            // do a local search
-            //long start = System.currentTimeMillis();
-            try {
             localSearch();
             plasmaSearchResult result = order();
             result.localContributions = rcLocal.size();
             // clean up
-            if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close();
             rcLocal = null;
             // return search result
             log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
             lastEvent = this;
             return result;
-            } catch (IOException e) {
-                return null;
-            }
         }
     }

-    public int localSearch() throws IOException {
+    public int localSearch() {
         // search for the set of hashes and return an array of urlEntry elements
         // retrieve entities that belong to the hashes
         profileLocal.startTimer();
-        Set entities = wordIndex.getEntities(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION));
-        if (entities.size() < query.size()) entities = null; // prevent that only a subset is returned
+        Set containers = wordIndex.getContainers(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION));
+        if (containers.size() < query.size()) containers = null; // prevent that only a subset is returned
         profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_COLLECTION);
-        profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (entities == null) ? 0 : entities.size());
+        profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (containers == null) ? 0 : containers.size());
         // since this is a conjunction we return an empty entity if any word is not known
-        if (entities == null) {
-            rcLocal = new plasmaWordIndexEntity(null);
+        if (containers == null) {
+            rcLocal = new plasmaWordIndexEntryContainer(null);
             return 0;
         }
         // join the result
         profileLocal.startTimer();
-        rcLocal = plasmaWordIndexEntity.joinEntities(entities, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN));
+        rcLocal = plasmaWordIndexEntryContainer.joinContainer(containers, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN), query.maxDistance);
         profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_JOIN);
         profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_JOIN, rcLocal.size());
@@ -190,7 +178,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
         log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS");
         long timeout = System.currentTimeMillis() + profileGlobal.duetime() + 4000;
-        searchThreads = yacySearch.searchHashes(query.queryHashes, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal);
+        searchThreads = yacySearch.searchHashes(query.queryHashes, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal);
         // wait until wanted delay passed or wanted result appeared
         while (System.currentTimeMillis() < timeout) {
@@ -204,20 +192,20 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
         return rcGlobal.size();
     }

-    public plasmaSearchResult order() throws IOException {
+    public plasmaSearchResult order() {
         // we collect the urlhashes and construct a list with urlEntry objects
         // attention: if minEntries is too high, this method will not terminate within the maxTime
-        plasmaWordIndexEntity searchResult = new plasmaWordIndexEntity(null);
-        searchResult.merge(rcLocal, -1);
-        searchResult.merge(rcGlobal, -1);
+        plasmaWordIndexEntryContainer searchResult = new plasmaWordIndexEntryContainer(null);
+        searchResult.add(rcLocal);
+        searchResult.add(rcGlobal);
         long preorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_PRESORT);
         long postorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_POSTSORT);
         profileLocal.startTimer();
         plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query);
-        preorder.addEntity(searchResult, preorderTime);
+        preorder.addContainer(searchResult, preorderTime);
         profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_PRESORT);
         profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_PRESORT, rcLocal.size());
@@ -289,19 +277,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
         Iterator hashi = query.queryHashes.iterator();
         while (hashi.hasNext()) {
             wordHash = (String) hashi.next();
-            Iterator i = rcGlobal.elements(true);
-            plasmaWordIndexEntry entry;
-            plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash, rcGlobal.size());
-            while (i.hasNext()) {
-                entry = (plasmaWordIndexEntry) i.next();
-                container.add(entry, System.currentTimeMillis());
-            }
-            wordIndex.addEntries(container, true);
-            log.logFine("FLUSHED " + wordHash + ": " + container.size() + " url entries");
+            rcGlobal.setWordHash(wordHash);
+            wordIndex.addEntries(rcGlobal, true);
+            log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries");
         }
         // the rcGlobal was flushed, empty it
         count += rcGlobal.size();
-        rcGlobal.deleteComplete();
+        rcGlobal.clear();
     }
     // wait a little bit before trying again
     try {Thread.sleep(3000);} catch (InterruptedException e) {}

@@ -116,8 +116,8 @@ public final class plasmaSearchPreOrder {
         return (plasmaWordIndexEntry) pageAcc.remove(top);
     }

-    public void addEntity(plasmaWordIndexEntity entity, long maxTime) {
-        Iterator i = entity.elements(true);
+    public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) {
+        Iterator i = container.entries();
         long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
         plasmaWordIndexEntry entry;
         while (i.hasNext()) {

@@ -72,12 +72,14 @@ public final class plasmaSearchQuery {
     public int domType;
     public String domGroupName;
     public int domMaxTargets;
+    public int maxDistance;

-    public plasmaSearchQuery(Set queryWords,
+    public plasmaSearchQuery(Set queryWords, int maxDistance,
                              String[] order, int wantedResults, long maximumTime, String urlMask,
                              String referrer,
                              int domType, String domGroupName, int domMaxTargets) {
         this.queryWords = queryWords;
+        this.maxDistance = maxDistance;
         this.queryHashes = words2hashes(queryWords);
         this.order = order;
         this.wantedResults = wantedResults;
@@ -89,9 +91,10 @@ public final class plasmaSearchQuery {
         this.domMaxTargets = domMaxTargets;
     }

-    public plasmaSearchQuery(Set queryHashes,
+    public plasmaSearchQuery(Set queryHashes, int maxDistance,
                              String[] order, int wantedResults, long maximumTime, String urlMask) {
         this.queryWords = null;
+        this.maxDistance = maxDistance;
         this.queryHashes = queryHashes;
         this.order = order;
         this.wantedResults = wantedResults;

@@ -54,11 +54,10 @@ import java.net.MalformedURLException;
 import de.anomic.kelondro.kelondroMScoreCluster;
 import de.anomic.server.serverCodings;
+import de.anomic.htmlFilter.htmlFilterContentScraper;

 public final class plasmaSearchResult {

-    public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
     private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
     private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
     private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
@@ -111,8 +110,8 @@ public final class plasmaSearchResult {
         URL url = page.url();
         String descr = page.descr();
         if ((url == null) || (descr == null)) return;
-        String[] urlcomps = url.toString().toLowerCase().split(splitrex); // word components of the url
-        String[] descrcomps = descr.toLowerCase().split(splitrex); // words in the description
+        String[] urlcomps = htmlFilterContentScraper.urlComps(url.toString()); // word components of the url
+        String[] descrcomps = descr.toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
         // store everything
         Object[] resultVector = new Object[] {indexEntry, page, urlcomps, descrcomps};

@@ -1285,7 +1285,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 yacyCore.seedDB.mySeed.hash,
                 referrerHash,
                 0, true,
-                condenser.RESULT_INFORMATION_VALUE,
+                condenser.RESULT_WORD_ENTROPHY,
                 plasmaWordIndexEntry.language(entry.url()),
                 plasmaWordIndexEntry.docType(document.getMimeType()),
                 (int) entry.size(),
@@ -1313,15 +1313,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         } else {
             HashMap urlCache = new HashMap(1);
             urlCache.put(newEntry.hash(),newEntry);
-            ArrayList tmpEntities = new ArrayList(condenser.RESULT_SIMI_WORDS);
+            ArrayList tmpContainers = new ArrayList(condenser.RESULT_SIMI_WORDS);
             String language = plasmaWordIndexEntry.language(entry.url());
             char doctype = plasmaWordIndexEntry.docType(document.getMimeType());
-            int quality = 0;
-            try {
-                quality = condenser.RESULT_INFORMATION_VALUE;
-            } catch (NumberFormatException e) {
-                System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + newEntry.url().toString());
-            }
+            int urlLength = newEntry.url().toString().length();
+            int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length;
             // iterate over all words
             Iterator i = condenser.words();
@@ -1332,8 +1328,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 String word = (String) wentry.getKey();
                 wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
                 String wordHash = plasmaWordIndexEntry.word2hash(word);
-                plasmaWordIndexEntity wordIdxEntity = new plasmaWordIndexEntity(wordHash);
+                plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash);
                 plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
+                    urlLength, urlComps,
                     wordStat.count,
                     condenser.RESULT_SIMI_WORDS,
                     condenser.RESULT_SIMI_SENTENCES,
@@ -1344,26 +1341,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                     newEntry.size(),
                     docDate.getTime(),
                     System.currentTimeMillis(),
-                    quality, language, doctype, true);
-                wordIdxEntity.addEntry(wordIdxEntry);
-                tmpEntities.add(wordIdxEntity);
+                    condenser.RESULT_WORD_ENTROPHY,
+                    language,
+                    doctype,
+                    true);
+                wordIdxContainer.add(wordIdxEntry);
+                tmpContainers.add(wordIdxContainer);
                 // wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));
             }
             //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
             words = condenser.RESULT_SIMI_WORDS;
             // transfering the index to the storage peer
-            String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000);
+            String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntity[tmpContainers.size()]),urlCache,true,120000);
             if (error != null) {
                 words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
             }
-            // cleanup
-            for (int j=0; j < tmpEntities.size(); j++) {
-                plasmaWordIndexEntity tmpEntity = (plasmaWordIndexEntity) tmpEntities.get(j);
-                try { tmpEntity.close(); } catch (Exception e) {}
-            }
+            tmpContainers = null;
         }
         storageEndTime = System.currentTimeMillis();

@@ -56,6 +56,7 @@ import java.util.Set;
 import java.util.Date;
 import java.net.URL;
+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.server.logging.serverLog;
@@ -136,15 +137,7 @@ public final class plasmaWordIndex {
     public int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaCondenser condenser, String language, char doctype) {
         // this is called by the switchboard to put in a new page into the index
-        // use all the words in one condenser object to simultanous create index
-        // entries
-        // int age = microDateDays(urlModified);
-        int quality = 0;
-        try {
-            quality = condenser.RESULT_INFORMATION_VALUE;
-        } catch (NumberFormatException e) {
-            System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString());
-        }
+        // use all the words in one condenser object to simultanous create index entries
         // iterate over all words
         Iterator i = condenser.words();
@@ -153,6 +146,9 @@ public final class plasmaWordIndex {
         plasmaWordIndexEntry ientry;
         plasmaCondenser.wordStatProp wprop;
         String wordHash;
+        int urlLength = url.toString().length();
+        int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
+
         while (i.hasNext()) {
             wentry = (Map.Entry) i.next();
             word = (String) wentry.getKey();
@@ -160,6 +156,7 @@ public final class plasmaWordIndex {
             // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
             wordHash = plasmaWordIndexEntry.word2hash(word);
             ientry = new plasmaWordIndexEntry(urlHash,
+                urlLength, urlComps,
                 wprop.count,
                 condenser.RESULT_SIMI_WORDS,
                 condenser.RESULT_SIMI_SENTENCES,
@@ -170,7 +167,10 @@ public final class plasmaWordIndex {
                 size,
                 urlModified.getTime(),
                 System.currentTimeMillis(),
-                quality, language, doctype, true);
+                condenser.RESULT_WORD_ENTROPHY,
+                language,
+                doctype,
+                true);
             addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false);
         }
         // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
@@ -178,10 +178,43 @@ public final class plasmaWordIndex {
         return condenser.RESULT_SIMI_WORDS;
     }

+    public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
+        return ramCache.getContainer(wordHash, deleteIfEmpty, maxTime);
+    }
+
     public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
-        return ramCache.getIndex(wordHash, deleteIfEmpty, maxTime);
+        return ramCache.getEntity(wordHash, deleteIfEmpty, maxTime);
     }

+    public Set getContainers(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
+        // retrieve entities that belong to the hashes
+        HashSet containers = new HashSet();
+        String singleHash;
+        plasmaWordIndexEntryContainer singleContainer;
+        Iterator i = wordHashes.iterator();
+        long start = System.currentTimeMillis();
+        long remaining;
+        while (i.hasNext()) {
+            // check time
+            remaining = maxTime - (System.currentTimeMillis() - start);
+            //if ((maxTime > 0) && (remaining <= 0)) break;
+            // get next hash:
+            singleHash = (String) i.next();
+            // retrieve index
+            singleContainer = getContainer(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size()));
+            // check result
+            if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashSet();
+            containers.add(singleContainer);
+        }
+        return containers;
+    }
+
+    /*
     public Set getEntities(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
         // retrieve entities that belong to the hashes
@@ -203,12 +236,13 @@ public final class plasmaWordIndex {
             singleEntity = getEntity(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - entities.size()));
             // check result
-            if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return null;
+            if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return new HashSet();
             entities.add(singleEntity);
         }
         return entities;
     }
+    */

     public int size() {
         return ramCache.size();

@@ -203,7 +203,7 @@ public final class plasmaWordIndexAssortmentCluster {
     }

     public plasmaWordIndexEntryContainer removeFromAll(String wordHash, long maxTime) {
-        // collect all records from all the assortments and return them
+        // removes all records from all the assortments and return them
         plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash);
         long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
         for (int i = 0; i < clusterCount; i++) {
@@ -214,6 +214,18 @@ public final class plasmaWordIndexAssortmentCluster {
         return record;
     }

+    public plasmaWordIndexEntryContainer getFromAll(String wordHash, long maxTime) {
+        // collect all records from all the assortments and return them
+        plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash);
+        long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
+        for (int i = 0; i < clusterCount; i++) {
+            buffer = assortments[i].get(wordHash);
+            if (buffer != null) record.add(buffer);
+            if (System.currentTimeMillis() > limitTime) break;
+        }
+        return record;
+    }
+
     public Iterator hashConjunction(String startWordHash, boolean up, boolean rot) {
         HashSet iterators = new HashSet();
         //if (rot) System.out.println("WARNING: kelondroMergeIterator does not work correctly when individual iterators rotate on their own!");

@@ -391,7 +391,18 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
         }
     }

-    public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) {
+    public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
+        long start = System.currentTimeMillis();
+        if (maxTime > 0) maxTime = 8 * maxTime / 10; // reserve time for later adding to backend
+        plasmaWordIndexEntryContainer container = assortmentCluster.getFromAll(wordHash, maxTime);
+        if (container == null) {
+            container = new plasmaWordIndexEntryContainer(wordHash);
+        }
+        container.add(backend.getContainer(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : System.currentTimeMillis() - start));
+        return container;
+    }
+
+    public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
         // this possibly creates an index file in the back-end
         // the index file is opened and returned as entity object
         long start = System.currentTimeMillis();
@@ -406,7 +417,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
             }
         }
         long r = maxTime - (System.currentTimeMillis() - start);
-        return backend.getIndex(wordHash, deleteIfEmpty, (r < 0) ? 0 : r);
+        return backend.getEntity(wordHash, deleteIfEmpty, (r < 0) ? 0 : r);
     }

     public long getUpdateTime(String wordHash) {

@@ -181,7 +181,24 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
         }
     }

-    public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) {
+    public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
+        long start = System.currentTimeMillis();
+        if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) {
+            plasmaWordIndexEntity entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
+            plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash);
+            plasmaWordIndexEntry entry;
+            Iterator i = entity.elements(true);
+            while ((i.hasNext()) && ((maxTime < 0) || (System.currentTimeMillis() < start + maxTime))) {
+                entry = (plasmaWordIndexEntry) i.next();
+                container.add(entry);
+            }
+            return container;
+        } else {
+            return new plasmaWordIndexEntryContainer(wordHash, 0);
+        }
+    }
+
+    public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
         return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty);
     }
@@ -190,7 +207,6 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
         if (f.exists()) return f.lastModified(); else return -1;
     }

     public void deleteIndex(String wordHash) {
         plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash);
     }
@@ -200,7 +216,7 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
         plasmaWordIndexEntity pi = null;
         int count = 0;
         try {
-            pi = getIndex(wordHash, true, -1);
+            pi = getEntity(wordHash, true, -1);
             for (int i = 0; i < urlHashes.length; i++)
                 if (pi.removeEntry(urlHashes[i], deleteComplete)) count++;
             int size = pi.size();

@@ -201,33 +201,33 @@ public final class plasmaWordIndexDistribution {
         // collect index
         String startPointHash = selectTransferStart();
         log.logFine("Selected hash " + startPointHash + " as start point for index distribution, distance = " + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash));
-        Object[] selectResult = selectTransferIndexes(startPointHash, indexCount, this.maxOpenFiles4Distribution);
-        plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) selectResult[0];
+        Object[] selectResult = selectTransferContainers(startPointHash, indexCount, this.maxOpenFiles4Distribution);
+        plasmaWordIndexEntryContainer[] indexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
         //Integer openedFiles = (Integer) selectResult[2];
         HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
-        if ((indexEntities == null) || (indexEntities.length == 0)) {
+        if ((indexContainers == null) || (indexContainers.length == 0)) {
            log.logFine("No index available for index transfer, hash start-point " + startPointHash);
            return -1;
         }
         // count the indexes again, can be smaller as expected
         indexCount = 0;
-        for (int i = 0; i < indexEntities.length; i++) {
-            indexCount += indexEntities[i].size();
+        for (int i = 0; i < indexContainers.length; i++) {
+            indexCount += indexContainers[i].size();
         }
         if (indexCount < 50) {
             log.logFine("Too few (" + indexCount + ") indexes selected for transfer.");
-            closeTransferIndexes (indexEntities);
+            closeTransferIndexes(indexContainers);
             return -1; // failed
         }
         // find start point for DHT-selection
-        String keyhash = indexEntities[indexEntities.length - 1].wordHash(); // DHT targets must have greater hashes
+        String keyhash = indexContainers[indexContainers.length - 1].wordHash(); // DHT targets must have greater hashes
         // find a list of DHT-peers
         yacySeed[] seeds = new yacySeed[peerCount + 10];
         int hc0 = 0;
-        double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[0].wordHash()),
-            yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[indexEntities.length - 1].wordHash()));
+        double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[0].wordHash()),
+            yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[indexContainers.length - 1].wordHash()));
         double maxDistance = Math.min(ownDistance, 0.4);
         synchronized (yacyCore.dhtAgent) {
             double avdist;
@@ -239,8 +239,8 @@ public final class plasmaWordIndexDistribution {
             }
             seeds[hc0] = (yacySeed) e.nextElement();
             if (seeds[hc0] != null) {
-                avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[0].wordHash()),
-                    yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[indexEntities.length - 1].wordHash()));
+                avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[0].wordHash()),
+                    yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[indexContainers.length - 1].wordHash()));
                 if (avdist < maxDistance) {
                     log.logInfo("Selected " + ((hc0 < peerCount) ? "primary" : "reserve") + " DHT target peer " + seeds[hc0].getName() + ":" + seeds[hc0].hash + ", distance = " + avdist);
                     hc0++;
@@ -252,7 +252,7 @@ public final class plasmaWordIndexDistribution {
         if (hc0 < peerCount) {
             log.logWarning("found not enough (" + hc0 + ") peers for distribution");
-            closeTransferIndexes (indexEntities);
+            closeTransferIndexes(indexContainers);
             return -1; // failed
         }
@@ -267,9 +267,9 @@ public final class plasmaWordIndexDistribution {
                 return -1; // interrupted
             }
             start = System.currentTimeMillis();
-            error = yacyClient.transferIndex(seeds[i], indexEntities, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
+            error = yacyClient.transferIndex(seeds[i], indexContainers, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
             if (error == null) {
-                log.logInfo("Index transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
+                log.logInfo("Index transfer of " + indexCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
                     + " seconds successfull (" + (1000 * indexCount / (System.currentTimeMillis() - start + 1)) + " words/s)");
                 peerNames += ", " + seeds[i].getName();
                 hc1++;
@@ -286,8 +286,8 @@ public final class plasmaWordIndexDistribution {
             // success
             if (delete) {
                 try {
-                    if (deleteTransferIndexes(indexEntities)) {
-                        log.logFine("Deleted all " + indexEntities.length + " transferred whole-word indexes locally");
+                    if (deleteTransferIndexes(indexContainers)) {
+                        log.logFine("Deleted all " + indexContainers.length + " transferred whole-word indexes locally");
                         return indexCount;
                     } else {
                         log.logSevere("Deleted not all transferred whole-word indexes");
@ -299,13 +299,13 @@ public final class plasmaWordIndexDistribution {
} }
} else { } else {
// simply close the indexEntities // simply close the indexContainers
closeTransferIndexes (indexEntities); closeTransferIndexes(indexContainers);
} }
return indexCount; return indexCount;
} else { } else {
log.logSevere("Index distribution failed. Too few peers (" + hc1 + ") received the index, not deleted locally."); log.logSevere("Index distribution failed. Too few peers (" + hc1 + ") received the index, not deleted locally.");
// simply close the indexEntities // simply close the indexContainers
closeTransferIndexes (indexEntities); closeTransferIndexes(indexContainers);
return -1; return -1;
} }
} }
@ -322,15 +322,16 @@ public final class plasmaWordIndexDistribution {
return startPointHash; return startPointHash;
} }
Object[] /* of {plasmaWordIndexEntity[], HashMap(String, plasmaCrawlLURL.Entry)}*/ Object[] /* of {plasmaWordIndexEntryContainer[], HashMap(String, plasmaCrawlLURL.Entry)}*/
selectTransferIndexes(String hash, int count, int maxOpenFiles) { selectTransferContainers(String hash, int count, int maxOpenFiles) {
// the hash is the start hash from which the indexes are picked // the hash is the start hash from which the indexes are picked
ArrayList tmpEntities = new ArrayList(count); ArrayList tmpContainers = new ArrayList(count);
String nexthash = ""; String nexthash = "";
try { try {
int currOpenFiles = 0; int currOpenFiles = 0;
Iterator wordHashIterator = this.wordIndex.wordHashes(hash, true, true); Iterator wordHashIterator = this.wordIndex.wordHashes(hash, true, true);
plasmaWordIndexEntity indexEntity, tmpEntity; plasmaWordIndexEntity indexEntity;
plasmaWordIndexEntryContainer indexContainer;
Iterator urlIter; Iterator urlIter;
Iterator hashIter; Iterator hashIter;
plasmaWordIndexEntry indexEntry; plasmaWordIndexEntry indexEntry;
@ -343,56 +344,15 @@ public final class plasmaWordIndexDistribution {
(wordHashIterator.hasNext()) && (wordHashIterator.hasNext()) &&
((nexthash = (String) wordHashIterator.next()) != null) && ((nexthash = (String) wordHashIterator.next()) != null) &&
(nexthash.trim().length() > 0) && (nexthash.trim().length() > 0) &&
((currOpenFiles == 0) || (yacyDHTAction.dhtDistance(nexthash, ((currOpenFiles == 0) ||
((plasmaWordIndexEntity)tmpEntities.get(0)).wordHash()) < 0.2)) (yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntryContainer)tmpContainers.get(0)).wordHash()) < 0.2))
) { ) {
indexEntity = this.wordIndex.getEntity(nexthash, true, -1); indexEntity = this.wordIndex.getEntity(nexthash, true, -1);
if (indexEntity.size() == 0) {
indexEntity.deleteComplete();
} else if ((indexEntity.size() <= count) || // if we haven't exceeded the limit
(Math.abs(indexEntity.size() - count) <= 10)) { // or at most 10 entries are left
// take the whole entity
try {
// first check if we know all URLs
urlIter = indexEntity.elements(true);
unknownURLEntries.clear();
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
try {
lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry);
if ((lurl == null) || (lurl.url() == null)) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
}
} catch (IOException e) {
unknownURLEntries.add(indexEntry.getUrlHash());
}
}
// now delete all entries that have no url entry
hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
String nextUrlHash = (String) hashIter.next();
indexEntity.removeEntry(nextUrlHash, false);
this.urlPool.loadedURL.remove(nextUrlHash);
}
if (indexEntity.size() == 0) { if (indexEntity.size() == 0) {
indexEntity.deleteComplete(); indexEntity.deleteComplete();
} else {
// use what's remaining
tmpEntities.add(indexEntity);
this.log.logFine("Selected whole index (" + indexEntity.size() + " URLs, " + unknownURLEntries.size() + " not bound) for word " + indexEntity.wordHash());
count -= indexEntity.size();
currOpenFiles++;
}
} catch (kelondroException e) {
this.log.logSevere("plasmaWordIndexDistribution/1: deleted DB for word " + indexEntity.wordHash(), e);
indexEntity.deleteComplete();
}
} else { } else {
// make an on-the-fly entity and insert values // make an on-the-fly container and insert values
tmpEntity = new plasmaWordIndexEntity(indexEntity.wordHash()); indexContainer = new plasmaWordIndexEntryContainer(indexEntity.wordHash());
try { try {
urlIter = indexEntity.elements(true); urlIter = indexEntity.elements(true);
unknownURLEntries.clear(); unknownURLEntries.clear();
@ -404,7 +364,7 @@ public final class plasmaWordIndexDistribution {
unknownURLEntries.add(indexEntry.getUrlHash()); unknownURLEntries.add(indexEntry.getUrlHash());
} else { } else {
knownURLs.put(indexEntry.getUrlHash(), lurl); knownURLs.put(indexEntry.getUrlHash(), lurl);
tmpEntity.addEntry(indexEntry); indexContainer.add(indexEntry);
count--; count--;
} }
} catch (IOException e) { } catch (IOException e) {
@ -426,8 +386,8 @@ public final class plasmaWordIndexDistribution {
} }
// use what's remaining // use what's remaining
this.log.logFine("Selected partial index (" + tmpEntity.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + tmpEntity.wordHash()); this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + indexContainer.wordHash());
tmpEntities.add(tmpEntity); tmpContainers.add(indexContainer);
} catch (kelondroException e) { } catch (kelondroException e) {
this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + indexEntity.wordHash(), e); this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + indexEntity.wordHash(), e);
indexEntity.deleteComplete(); indexEntity.deleteComplete();
@ -438,8 +398,8 @@ public final class plasmaWordIndexDistribution {
} }
// transfer to array // transfer to array
plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]); plasmaWordIndexEntryContainer[] entryContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
return new Object[]{indexEntities, knownURLs, new Integer(currOpenFiles)}; return new Object[]{entryContainers, knownURLs, new Integer(currOpenFiles)};
} catch (IOException e) { } catch (IOException e) {
this.log.logSevere("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage(), e); this.log.logSevere("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage(), e);
return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)}; return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)};
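In essence, selectTransferContainers partitions each word's entries by whether their URL hash still resolves in the loaded-URL database: resolvable entries go into the transfer container, the rest are collected for deletion. A sketch of that filter under simplified stand-in types (WordEntry, UrlStore, Container), not the real plasma classes:

    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Map;
    import java.util.Set;

    interface WordEntry { String getUrlHash(); }
    interface UrlStore  { Object getEntry(String urlHash); }  // null if the URL is unknown
    interface Container { void add(WordEntry e); }

    class TransferFilter {
        // returns the URL hashes that could not be resolved (to be removed from the index)
        static Set<String> fillContainer(Iterator<WordEntry> entries, UrlStore urls,
                                         Map<String, Object> knownURLs, Container target) {
            Set<String> unknown = new HashSet<String>();
            while (entries.hasNext()) {
                WordEntry e = entries.next();
                Object lurl = urls.getEntry(e.getUrlHash());
                if (lurl == null) {
                    unknown.add(e.getUrlHash());   // dead reference: schedule for deletion
                } else {
                    knownURLs.put(e.getUrlHash(), lurl);
                    target.add(e);                 // resolvable: safe to transfer
                }
            }
            return unknown;
        }
    }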
@ -477,6 +437,40 @@ public final class plasmaWordIndexDistribution {
} catch (IOException ee) {} } catch (IOException ee) {}
} }
void closeTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) {
for (int i = 0; i < indexContainers.length; i++) {
indexContainers[i] = null;
}
}
boolean deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) throws IOException {
Iterator urlIter;
plasmaWordIndexEntry indexEntry;
plasmaWordIndexEntity indexEntity;
String[] urlHashes;
int sz;
boolean success = true;
for (int i = 0; i < indexContainers.length; i++) {
// delete entries separately
int c = 0;
urlHashes = new String[indexContainers[i].size()];
urlIter = indexContainers[i].entries();
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
urlHashes[c++] = indexEntry.getUrlHash();
}
wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true);
indexEntity = wordIndex.getEntity(indexContainers[i].wordHash(), true, -1);
sz = indexEntity.size();
// indexEntity.close();
closeTransferIndex(indexEntity);
log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + sz + " entries left");
indexContainers[i] = null;
}
return success;
}
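The deletion step above follows a simple pattern: drain the container's URL hashes into an array, then remove them from the word index in a single batch call. A reduced sketch, with WordIndex standing in for plasmaWordIndex:

    import java.util.Iterator;

    interface WordIndex {
        void removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete);
    }

    class TransferCleanup {
        // one removeEntries() call per word keeps the on-disk index access batched
        static void deleteTransferred(String wordHash, Iterator<String> urlHashIter,
                                      int size, WordIndex idx) {
            String[] urlHashes = new String[size];
            int c = 0;
            while (urlHashIter.hasNext()) urlHashes[c++] = urlHashIter.next();
            idx.removeEntries(wordHash, urlHashes, true);
        }
    }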
/*
boolean deleteTransferIndexes(plasmaWordIndexEntity[] indexEntities) throws IOException { boolean deleteTransferIndexes(plasmaWordIndexEntity[] indexEntities) throws IOException {
Iterator urlIter; Iterator urlIter;
plasmaWordIndexEntry indexEntry; plasmaWordIndexEntry indexEntry;
@ -500,13 +494,6 @@ public final class plasmaWordIndexDistribution {
// indexEntity.close(); // indexEntity.close();
closeTransferIndex(indexEntity); closeTransferIndex(indexEntity);
log.logFine("Deleted partial index (" + c + " URLs) for word " + indexEntities[i].wordHash() + "; " + sz + " entries left"); log.logFine("Deleted partial index (" + c + " URLs) for word " + indexEntities[i].wordHash() + "; " + sz + " entries left");
// DEBUG: now try to delete the remaining index. If this works, this routine is fine
/*
if (wordIndex.getEntity(indexEntities[i].wordHash()).deleteComplete())
System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " SUCCESSFULL");
else
System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " FAILED");
*/
// end debug // end debug
indexEntities[i].close(); indexEntities[i].close();
} else { } else {
@ -516,7 +503,7 @@ public final class plasmaWordIndexDistribution {
} else { } else {
indexEntities[i].close(); indexEntities[i].close();
// have another try... // have another try...
if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot() /*PLASMADB*/, indexEntities[i].wordHash()).delete())) { if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot(), indexEntities[i].wordHash()).delete())) {
success = false; success = false;
log.logSevere("Could not delete whole index for word " + indexEntities[i].wordHash()); log.logSevere("Could not delete whole index for word " + indexEntities[i].wordHash());
} }
@ -526,6 +513,7 @@ public final class plasmaWordIndexDistribution {
} }
return success; return success;
} }
*/
public void startTransferWholeIndex(yacySeed seed, boolean delete) { public void startTransferWholeIndex(yacySeed seed, boolean delete) {
if (transferIdxThread == null) { if (transferIdxThread == null) {
@ -573,14 +561,14 @@ public final class plasmaWordIndexDistribution {
// word chunk // word chunk
private String endPointHash; private String endPointHash;
private String startPointHash; private String startPointHash;
plasmaWordIndexEntity[] indexEntities; plasmaWordIndexEntryContainer[] indexContainers;
// other fields // other fields
HashMap urlCache; HashMap urlCache;
public transferIndexWorkerThread( public transferIndexWorkerThread(
yacySeed seed, yacySeed seed,
plasmaWordIndexEntity[] indexEntities, plasmaWordIndexEntryContainer[] indexContainers,
HashMap urlCache, HashMap urlCache,
boolean gzipBody, boolean gzipBody,
int timeout, int timeout,
@ -594,7 +582,7 @@ public final class plasmaWordIndexDistribution {
this.timeout4Transfer = timeout; this.timeout4Transfer = timeout;
this.iteration = iteration; this.iteration = iteration;
this.seed = seed; this.seed = seed;
this.indexEntities = indexEntities; this.indexContainers = indexContainers;
this.urlCache = urlCache; this.urlCache = urlCache;
this.idxCount = idxCount; this.idxCount = idxCount;
this.chunkSize = chunkSize; this.chunkSize = chunkSize;
@ -657,11 +645,11 @@ public final class plasmaWordIndexDistribution {
// transferring selected words to remote peer // transferring selected words to remote peer
this.status = "Running: Transfering chunk " + iteration; this.status = "Running: Transfering chunk " + iteration;
String error = yacyClient.transferIndex(seed, indexEntities, urlCache, gzipBody4Transfer, timeout4Transfer); String error = yacyClient.transferIndex(seed, indexContainers, urlCache, gzipBody4Transfer, timeout4Transfer);
if (error == null) { if (error == null) {
// words successfully transferred // words successfully transferred
transferTime = System.currentTimeMillis() - start; transferTime = System.currentTimeMillis() - start;
plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "]" + plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length-1].wordHash() + "]" +
" to peer " + seed.getName() + ":" + seed.hash + " in " + (transferTime/1000) + " seconds successfull (" + " to peer " + seed.getName() + ":" + seed.hash + " in " + (transferTime/1000) + " seconds successfull (" +
(1000 * idxCount / (transferTime + 1)) + " words/s)"); (1000 * idxCount / (transferTime + 1)) + " words/s)");
retryCount = 0; retryCount = 0;
@ -817,7 +805,7 @@ public final class plasmaWordIndexDistribution {
} }
public void performTransferWholeIndex() { public void performTransferWholeIndex() {
plasmaWordIndexEntity[] newIndexEntities = null, oldIndexEntities = null; plasmaWordIndexEntryContainer[] newIndexContainers = null, oldIndexContainers = null;
try { try {
// pausing the regular index distribution // pausing the regular index distribution
// TODO: adding sync, to wait for a still running index distribution to finish // TODO: adding sync, to wait for a still running index distribution to finish
@ -838,12 +826,12 @@ public final class plasmaWordIndexDistribution {
iteration++; iteration++;
int idxCount = 0; int idxCount = 0;
selectionStart = System.currentTimeMillis(); selectionStart = System.currentTimeMillis();
oldIndexEntities = newIndexEntities; oldIndexContainers = newIndexContainers;
// selecting 500 words to transfer // selecting 500 words to transfer
this.status = "Running: Selecting chunk " + iteration; this.status = "Running: Selecting chunk " + iteration;
Object[] selectResult = selectTransferIndexes(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue()); Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
newIndexEntities = (plasmaWordIndexEntity[]) selectResult[0]; newIndexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
openedFiles = (Integer) selectResult[2]; openedFiles = (Integer) selectResult[2];
@ -851,7 +839,7 @@ public final class plasmaWordIndexDistribution {
* a) no words are left in the index * a) no words are left in the index
* b) max open file limit was exceeded * b) max open file limit was exceeded
*/ */
if ((newIndexEntities == null) || (newIndexEntities.length == 0)) { if ((newIndexContainers == null) || (newIndexContainers.length == 0)) {
if (sb.wordIndex.size() > 0) { if (sb.wordIndex.size() > 0) {
// if there are still words in the index we try it again now // if there are still words in the index we try it again now
startPointHash = "------------"; startPointHash = "------------";
@ -863,15 +851,15 @@ public final class plasmaWordIndexDistribution {
} }
} else { } else {
// count the indexes again; the result can be smaller than expected // count the indexes again; the result can be smaller than expected
for (int i = 0; i < newIndexEntities.length; i++) idxCount += newIndexEntities[i].size(); for (int i = 0; i < newIndexContainers.length; i++) idxCount += newIndexContainers[i].size();
// getting start point for next DHT-selection // getting start point for next DHT-selection
oldStartingPointHash = startPointHash; oldStartingPointHash = startPointHash;
startPointHash = newIndexEntities[newIndexEntities.length - 1].wordHash(); // DHT targets must have greater hashes startPointHash = newIndexContainers[newIndexContainers.length - 1].wordHash(); // DHT targets must have greater hashes
selectionEnd = System.currentTimeMillis(); selectionEnd = System.currentTimeMillis();
selectionTime = selectionEnd - selectionStart; selectionTime = selectionEnd - selectionStart;
plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexEntities[0].wordHash() + " .. " + newIndexEntities[newIndexEntities.length-1].wordHash() + "]" + plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexContainers[0].wordHash() + " .. " + newIndexContainers[newIndexContainers.length-1].wordHash() + "]" +
" in " + " in " +
(selectionTime / 1000) + " seconds (" + (selectionTime / 1000) + " seconds (" +
(1000 * idxCount / (selectionTime+1)) + " words/s)"); (1000 * idxCount / (selectionTime+1)) + " words/s)");
@ -886,10 +874,10 @@ public final class plasmaWordIndexDistribution {
this.status = "Aborted because of Transfer error:\n" + worker.getStatus(); this.status = "Aborted because of Transfer error:\n" + worker.getStatus();
// cleanup. closing all open files // cleanup. releasing all containers
closeEntities(oldIndexEntities); closeContainers(oldIndexContainers);
oldIndexEntities = null; oldIndexContainers = null;
closeEntities(newIndexEntities); closeContainers(newIndexContainers);
newIndexEntities = null; newIndexContainers = null;
// abort index transfer // abort index transfer
return; return;
@ -922,10 +910,10 @@ public final class plasmaWordIndexDistribution {
if (delete) { if (delete) {
this.status = "Running: Deleting chunk " + iteration; this.status = "Running: Deleting chunk " + iteration;
try { try {
if (deleteTransferIndexes(oldIndexEntities)) { if (deleteTransferIndexes(oldIndexContainers)) {
plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexEntities.length + " transferred whole-word indexes locally"); plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexContainers.length + " transferred whole-word indexes locally");
transferedEntryCount += idxCount; transferedEntryCount += idxCount;
transferedEntityCount += oldIndexEntities.length; transferedEntityCount += oldIndexContainers.length;
} else { } else {
plasmaWordIndexDistribution.this.log.logSevere("Deleted not all transferred whole-word indexes"); plasmaWordIndexDistribution.this.log.logSevere("Deleted not all transferred whole-word indexes");
} }
@ -933,18 +921,18 @@ public final class plasmaWordIndexDistribution {
plasmaWordIndexDistribution.this.log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee); plasmaWordIndexDistribution.this.log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee);
} }
} else { } else {
this.closeEntities(oldIndexEntities); this.closeContainers(oldIndexContainers);
transferedEntryCount += idxCount; transferedEntryCount += idxCount;
transferedEntityCount += oldIndexEntities.length; transferedEntityCount += oldIndexContainers.length;
} }
oldIndexEntities = null; oldIndexContainers = null;
} }
this.worker = null; this.worker = null;
} }
// hand over chunk to transfer worker // hand over chunk to transfer worker
if (!((newIndexEntities == null) || (newIndexEntities.length == 0))) { if (!((newIndexContainers == null) || (newIndexContainers.length == 0))) {
worker = new transferIndexWorkerThread(seed,newIndexEntities,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash); worker = new transferIndexWorkerThread(seed,newIndexContainers,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash);
worker.start(); worker.start();
} }
} }
@ -961,30 +949,21 @@ public final class plasmaWordIndexDistribution {
try {worker.join();}catch(Exception e){} try {worker.join();}catch(Exception e){}
// worker = null; // worker = null;
} }
if (oldIndexEntities != null) closeEntities(oldIndexEntities); if (oldIndexContainers != null) closeContainers(oldIndexContainers);
if (newIndexEntities != null) closeEntities(newIndexEntities); if (newIndexContainers != null) closeContainers(newIndexContainers);
plasmaWordIndexDistribution.this.paused = false; plasmaWordIndexDistribution.this.paused = false;
} }
} }
private void closeEntities(plasmaWordIndexEntity[] indexEntities) { private void closeContainers(plasmaWordIndexEntryContainer[] indexContainers) {
if ((indexEntities == null)||(indexEntities.length ==0)) return; if ((indexContainers == null)||(indexContainers.length ==0)) return;
for (int i = 0; i < indexEntities.length; i++) try {
indexEntities[i].close();
} catch (IOException ee) {}
}
/* for (int i = 0; i < indexContainers.length; i++) {
private boolean isAborted() { indexContainers[i] = null;
if (finished || Thread.currentThread().isInterrupted()) {
this.status = "aborted";
return true;
} }
return false;
} }
*/
} }
} }

@ -48,7 +48,6 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.Set;
import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
@ -111,6 +110,7 @@ public final class plasmaWordIndexEntity {
hash.substring(4,6) + "/" + hash + ".db"); hash.substring(4,6) + "/" + hash + ".db");
} }
/*
public plasmaWordIndexEntity(String wordHash) { public plasmaWordIndexEntity(String wordHash) {
// this creates a nameless temporary index. It is needed for combined search // this creates a nameless temporary index. It is needed for combined search
// and used to hold the intersection of two indexes // and used to hold the intersection of two indexes
@ -121,7 +121,7 @@ public final class plasmaWordIndexEntity {
theLocation = null; theLocation = null;
theTmpMap = new TreeMap(); theTmpMap = new TreeMap();
} }
*/
public boolean isTMPEntity() { public boolean isTMPEntity() {
return theTmpMap != null; return theTmpMap != null;
} }
@ -302,12 +302,6 @@ public final class plasmaWordIndexEntity {
else return "EMPTY"; else return "EMPTY";
} }
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public void merge(plasmaWordIndexEntity otherEntity, long time) throws IOException { public void merge(plasmaWordIndexEntity otherEntity, long time) throws IOException {
// this is a merge of another entity to this entity // this is a merge of another entity to this entity
@ -324,6 +318,14 @@ public final class plasmaWordIndexEntity {
} }
} }
/*
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException { public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException {
// big problem here: there cannot be a time-out for join, since a time-out will leave the joined set too big. // big problem here: there cannot be a time-out for join, since a time-out will leave the joined set too big.
@ -485,5 +487,5 @@ public final class plasmaWordIndexEntity {
} }
return conj; return conj;
} }
*/
} }

@ -112,6 +112,9 @@ public final class plasmaWordIndexEntry {
public static final int AP_IMG = 9; // tag inside image references public static final int AP_IMG = 9; // tag inside image references
public static final int AP_TAG = 10; // for tagged indexing (i.e. using mp3 tags) public static final int AP_TAG = 10; // for tagged indexing (i.e. using mp3 tags)
public static final int AP_ANCHOR = 11; // anchor description public static final int AP_ANCHOR = 11; // anchor description
public static final int AP_BOLD = 12;
public static final int AP_ITALICS = 13;
public static final int AP_INVISIBLE = 14; // good for spam detection
// URL attributes // URL attributes
public static final int UA_LOCAL = 0; // URL was crawled locally public static final int UA_LOCAL = 0; // URL was crawled locally
@ -208,6 +211,8 @@ public final class plasmaWordIndexEntry {
// the class instantiation can only be done by a plasmaStore method // the class instantiation can only be done by a plasmaStore method
// therefore they are all public // therefore they are all public
public plasmaWordIndexEntry(String urlHash, public plasmaWordIndexEntry(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int hitcount, //*how often this word appears in the text int hitcount, //*how often this word appears in the text
int wordcount, //*total number of words int wordcount, //*total number of words
int phrasecount, //*total number of phrases int phrasecount, //*total number of phrases
@ -227,14 +232,9 @@ public final class plasmaWordIndexEntry {
// more needed attributes: // more needed attributes:
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc
// - boolean: URL attributes // - boolean: URL attributes
// - int: url-length (shorter are better)
// - int: url-number of components / length of path
// - int: length of description tag / title tag (longer are better) // - int: length of description tag / title tag (longer are better)
// - int: number of chapters
// - int: # of outlinks to same domain // - int: # of outlinks to same domain
// - int: # of outlinks to outside domain // - int: # of outlinks to outside domain
// - int: length of description
// - int: length of title
// - int: # of keywords // - int: # of keywords
if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk"; if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk";

@ -54,12 +54,14 @@ package de.anomic.plasma;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Set;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
public final class plasmaWordIndexEntryContainer implements Comparable { public final class plasmaWordIndexEntryContainer implements Comparable {
private final String wordHash; private String wordHash;
private final HashMap container; // urlHash/plasmaWordIndexEntry - Mapping private final HashMap container; // urlHash/plasmaWordIndexEntry - Mapping
private long updateTime; private long updateTime;
@ -73,6 +75,15 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
container = new HashMap(initContainerSize); // a urlhash/plasmaWordIndexEntry - relation container = new HashMap(initContainerSize); // a urlhash/plasmaWordIndexEntry - relation
} }
public void setWordHash(String newWordHash) {
// this is used to replicate a container for different word indexes during global search
this.wordHash = newWordHash;
}
public void clear() {
container.clear();
}
public int size() { public int size() {
return container.size(); return container.size();
} }
@ -85,14 +96,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return wordHash; return wordHash;
} }
public int add(plasmaWordIndexEntry entry) {
return add(entry, System.currentTimeMillis());
}
public int add(plasmaWordIndexEntry entry, long updateTime) { public int add(plasmaWordIndexEntry entry, long updateTime) {
this.updateTime = java.lang.Math.max(this.updateTime, updateTime); this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
return (add(entry)) ? 1 : 0; return (addi(entry)) ? 1 : 0;
} }
public int add(plasmaWordIndexEntry[] entries, long updateTime) { public int add(plasmaWordIndexEntry[] entries, long updateTime) {
int c = 0; int c = 0;
for (int i = 0; i < entries.length; i++) if (add(entries[i])) c++; for (int i = 0; i < entries.length; i++) if (addi(entries[i])) c++;
this.updateTime = java.lang.Math.max(this.updateTime, updateTime); this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
return c; return c;
} }
@ -102,13 +117,13 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
Iterator i = c.entries(); Iterator i = c.entries();
int x = 0; int x = 0;
while (i.hasNext()) { while (i.hasNext()) {
if (add((plasmaWordIndexEntry) i.next())) x++; if (addi((plasmaWordIndexEntry) i.next())) x++;
} }
this.updateTime = java.lang.Math.max(this.updateTime, c.updateTime); this.updateTime = java.lang.Math.max(this.updateTime, c.updateTime);
return x; return x;
} }
private boolean add(plasmaWordIndexEntry entry) { private boolean addi(plasmaWordIndexEntry entry) {
// returns true if the new entry was added, false if it already existed // returns true if the new entry was added, false if it already existed
return (container.put(entry.getUrlHash(), entry) == null); return (container.put(entry.getUrlHash(), entry) == null);
} }
@ -117,10 +132,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return container.containsKey(urlHash); return container.containsKey(urlHash);
} }
public plasmaWordIndexEntry get(String urlHash) {
return (plasmaWordIndexEntry) container.get(urlHash);
}
public plasmaWordIndexEntry[] getEntryArray() { public plasmaWordIndexEntry[] getEntryArray() {
return (plasmaWordIndexEntry[]) container.values().toArray(new plasmaWordIndexEntry[container.size()]); return (plasmaWordIndexEntry[]) container.values().toArray(new plasmaWordIndexEntry[container.size()]);
} }
public plasmaWordIndexEntry remove(String urlHash) {
return (plasmaWordIndexEntry) container.remove(urlHash);
}
public Iterator entries() { public Iterator entries() {
// returns an iterator of plasmaWordIndexEntry objects // returns an iterator of plasmaWordIndexEntry objects
return container.values().iterator(); return container.values().iterator();
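With get, remove, clear and the single-argument add, the container now behaves like a small in-memory map keyed by URL hash. A hedged usage sketch follows; "wordHashXYZ" is a placeholder (real word hashes have a fixed base64 length), and 'entry' is an assumed pre-existing plasmaWordIndexEntry, since its constructor takes many ranking attributes:

    // usage sketch (inside de.anomic.plasma, with an existing plasmaWordIndexEntry 'entry')
    void containerUsage(plasmaWordIndexEntry entry) {
        plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer("wordHashXYZ");
        c.add(entry);                                        // stamps updateTime with 'now'
        plasmaWordIndexEntry e = c.get(entry.getUrlHash());  // O(1) lookup by URL hash
        java.util.Iterator it = c.entries();                 // all plasmaWordIndexEntry values
        c.remove(entry.getUrlHash());                        // detach one URL from the word
        c.clear();                                           // drop all entries, keep the word hash
    }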
@ -146,4 +169,126 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4)); return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4));
} }
public static plasmaWordIndexEntryContainer joinContainer(Set containers, long time, int maxDistance) {
long stamp = System.currentTimeMillis();
// order the containers by their size
TreeMap map = new TreeMap();
plasmaWordIndexEntryContainer singleContainer;
Iterator i = containers.iterator();
int count = 0;
while (i.hasNext()) {
// get next container:
singleContainer = (plasmaWordIndexEntryContainer) i.next();
// check result
if ((singleContainer == null) || (singleContainer.size() == 0)) return new plasmaWordIndexEntryContainer(null); // as this is a conjunction of searches, we have no result if any word is not known
// store result in order of result size
map.put(new Long(singleContainer.size() * 1000 + count), singleContainer);
count++;
}
// check if there is any result
if (map.size() == 0) return new plasmaWordIndexEntryContainer(null); // no result, nothing found
// the map now holds the search results in order of number of hits per word
// we now must pairwise build up a conjunction of these sets
Long k = (Long) map.firstKey(); // the smallest, i.e. the one with the fewest entries
plasmaWordIndexEntryContainer searchA, searchB, searchResult = (plasmaWordIndexEntryContainer) map.remove(k);
while ((map.size() > 0) && (searchResult.size() > 0)) {
// take the next-smallest result set from the map and intersect it with the current result
k = (Long) map.firstKey(); // the next smallest...
time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
searchA = searchResult;
searchB = (plasmaWordIndexEntryContainer) map.remove(k);
searchResult = plasmaWordIndexEntryContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance);
// free resources
searchA = null;
searchB = null;
}
// 'searchResult' now holds the combined search result
if (searchResult.size() == 0) return new plasmaWordIndexEntryContainer(null);
return searchResult;
}
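joinContainer intersects the result sets smallest-first: the TreeMap key singleContainer.size() * 1000 + count sorts containers by size, while the running count keeps equal-sized containers from colliding (which presumes fewer than 1000 containers per query). Starting with the smallest set bounds every intermediate result by that set's size. For example, with three containers of sizes 7, 300 and 12000:

    keys:   7*1000+0 = 7000   <   300*1000+1 = 300001   <   12000*1000+2 = 12000002
    order:  join(|7|, |300|) gives at most 7 hits; join(result, |12000|) still gives at most 7 hits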
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public static plasmaWordIndexEntryContainer joinConstructive(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) {
if ((i1 == null) || (i2 == null)) return null;
if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntryContainer(null);
// decide which method to use
int high = ((i1.size() > i2.size()) ? i1.size() : i2.size());
int low = ((i1.size() > i2.size()) ? i2.size() : i1.size());
int stepsEnum = 10 * (high + low - 1);
int stepsTest = 12 * log2(high) * low;
// start the more efficient method
if (stepsEnum > stepsTest) {
if (i1.size() < i2.size())
return joinConstructiveByTest(i1, i2, time, maxDistance);
else
return joinConstructiveByTest(i2, i1, time, maxDistance);
} else {
return joinConstructiveByEnumeration(i1, i2, time, maxDistance);
}
}
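The cost model behind this choice: enumeration walks both sets linearly (roughly high + low steps), while the test join performs low hash probes, each weighted like a log2(high) lookup. Plugging in high = 10000 and low = 100 (the integer log2 above returns the bit length, 14 for 10000):

    stepsEnum = 10 * (10000 + 100 - 1)    = 100990
    stepsTest = 12 * log2(10000) * 100    = 12 * 14 * 100 = 16800
    stepsEnum > stepsTest, so joinConstructiveByTest(smaller, larger, ...) is chosen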
private static plasmaWordIndexEntryContainer joinConstructiveByTest(plasmaWordIndexEntryContainer small, plasmaWordIndexEntryContainer large, long time, int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY TEST");
plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result
Iterator se = small.entries();
plasmaWordIndexEntry ie0, ie1;
long stamp = System.currentTimeMillis();
while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
ie0 = (plasmaWordIndexEntry) se.next();
ie1 = large.get(ie0.getUrlHash());
if (ie1 != null) {
// this is a hit. Calculate word distance:
ie0.combineDistance(ie1);
if (ie0.worddistance() <= maxDistance) conj.add(ie0);
}
}
return conj;
}
private static plasmaWordIndexEntryContainer joinConstructiveByEnumeration(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY ENUMERATION");
plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result
Iterator e1 = i1.entries();
Iterator e2 = i2.entries();
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
plasmaWordIndexEntry ie1;
plasmaWordIndexEntry ie2;
ie1 = (plasmaWordIndexEntry) e1.next();
ie2 = (plasmaWordIndexEntry) e2.next();
long stamp = System.currentTimeMillis();
while ((System.currentTimeMillis() - stamp) < time) {
c = ie1.getUrlHash().compareTo(ie2.getUrlHash());
if (c < 0) {
if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
} else if (c > 0) {
if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
} else {
// we have found the same URLs in different searches!
ie1.combineDistance(ie2);
if (ie1.worddistance() <= maxDistance) conj.add(ie1);
if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
}
}
}
return conj;
}
} }

@ -50,7 +50,8 @@ public interface plasmaWordIndexInterface {
public Iterator wordHashes(String startWordHash, boolean up); public Iterator wordHashes(String startWordHash, boolean up);
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime); public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime);
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime);
public long getUpdateTime(String wordHash); public long getUpdateTime(String wordHash);
public void deleteIndex(String wordHash); public void deleteIndex(String wordHash);

@ -50,12 +50,13 @@ import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.plasma.plasmaURLPattern; import de.anomic.plasma.plasmaURLPattern;
@ -349,10 +350,11 @@ public final class yacyClient {
public static int search( public static int search(
String wordhashes, String wordhashes,
int maxDistance,
boolean global, boolean global,
yacySeed targetPeer, yacySeed targetPeer,
plasmaCrawlLURL urlManager, plasmaCrawlLURL urlManager,
plasmaWordIndexEntity entityCache, plasmaWordIndexEntryContainer containerCache,
plasmaURLPattern blacklist, plasmaURLPattern blacklist,
plasmaSnippetCache snippets, plasmaSnippetCache snippets,
plasmaSearchProfile profile plasmaSearchProfile profile
@ -403,6 +405,7 @@ public final class yacyClient {
obj.put("ttl", "0"); obj.put("ttl", "0");
obj.put("duetime", Long.toString(duetime)); obj.put("duetime", Long.toString(duetime));
obj.put("profile", profile.targetToString()); // new duetimes splitted by specific search tasks obj.put("profile", profile.targetToString()); // new duetimes splitted by specific search tasks
obj.put("maxdist", maxDistance);
obj.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date())); obj.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date()));
//yacyCore.log.logDebug("yacyClient.search url=" + url); //yacyCore.log.logDebug("yacyClient.search url=" + url);
@ -460,6 +463,9 @@ public final class yacyClient {
// get one single search result // get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true); urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with blacklist if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with blacklist
int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry // save the url entry
final plasmaWordIndexEntry entry; final plasmaWordIndexEntry entry;
@ -467,6 +473,7 @@ public final class yacyClient {
// the old way to define words // the old way to define words
entry = new plasmaWordIndexEntry( entry = new plasmaWordIndexEntry(
urlEntry.hash(), urlEntry.hash(),
urlLength, urlComps,
urlEntry.wordCount(), urlEntry.wordCount(),
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
urlEntry.size(), urlEntry.size(),
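The two new ranking inputs are cheap to derive from the URL string itself. For "http://www.example.com/en/docs/index.html", urlLength is simply the string length, 41; urlComps counts the components produced by htmlFilterContentScraper.urlComps(), presumably the host and path parts, e.g. www / example / com / en / docs / index.html giving 6, though the exact splitting rule lives in that scraper method and the count here is an assumption. Shorter URLs with fewer components can then be ranked higher.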
@ -494,7 +501,7 @@ public final class yacyClient {
} }
// finally insert the containers into the index // finally insert the containers into the index
for (int m = 0; m < words; m++) { entityCache.addEntries(container[m]); } for (int m = 0; m < words; m++) { containerCache.add(container[m]); }
// generate statistics // generate statistics
long searchtime; long searchtime;
@ -841,7 +848,7 @@ public final class yacyClient {
httpHeader requestHeader) throws IOException { httpHeader requestHeader) throws IOException {
*/ */
public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, HashMap urlCache, boolean gzipBody, int timeout) { public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
HashMap in = transferRWI(targetSeed, indexes, gzipBody, timeout); HashMap in = transferRWI(targetSeed, indexes, gzipBody, timeout);
if (in == null) { return "no_connection_1"; } if (in == null) { return "no_connection_1"; }
@ -875,7 +882,7 @@ public final class yacyClient {
return null; return null;
} }
private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, boolean gzipBody, int timeout) { private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, boolean gzipBody, int timeout) {
final String address = targetSeed.getAddress(); final String address = targetSeed.getAddress();
if (address == null) { return null; } if (address == null) { return null; }
@ -903,7 +910,7 @@ public final class yacyClient {
Iterator eenum; Iterator eenum;
plasmaWordIndexEntry entry; plasmaWordIndexEntry entry;
for (int i = 0; i < indexes.length; i++) { for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].elements(true); eenum = indexes[i].entries();
while (eenum.hasNext()) { while (eenum.hasNext()) {
entry = (plasmaWordIndexEntry) eenum.next(); entry = (plasmaWordIndexEntry) eenum.next();
entrypost.append(indexes[i].wordHash()) entrypost.append(indexes[i].wordHash())

@ -52,8 +52,8 @@ import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaURLPattern; import de.anomic.plasma.plasmaURLPattern;
import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaSearchProfile; import de.anomic.plasma.plasmaSearchProfile;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
public class yacySearch extends Thread { public class yacySearch extends Thread {
@ -61,29 +61,31 @@ public class yacySearch extends Thread {
final private Set wordhashes; final private Set wordhashes;
final private boolean global; final private boolean global;
final private plasmaCrawlLURL urlManager; final private plasmaCrawlLURL urlManager;
final private plasmaWordIndexEntity entityCache; final private plasmaWordIndexEntryContainer containerCache;
final private plasmaURLPattern blacklist; final private plasmaURLPattern blacklist;
final private plasmaSnippetCache snippetCache; final private plasmaSnippetCache snippetCache;
final private yacySeed targetPeer; final private yacySeed targetPeer;
private int links; private int links;
private int maxDistance;
final private plasmaSearchProfile profile; final private plasmaSearchProfile profile;
public yacySearch(Set wordhashes, boolean global, yacySeed targetPeer, public yacySearch(Set wordhashes, int maxDistance, boolean global, yacySeed targetPeer,
plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) { plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
super("yacySearch_" + targetPeer.getName()); super("yacySearch_" + targetPeer.getName());
this.wordhashes = wordhashes; this.wordhashes = wordhashes;
this.global = global; this.global = global;
this.urlManager = urlManager; this.urlManager = urlManager;
this.entityCache = entityCache; this.containerCache = containerCache;
this.blacklist = blacklist; this.blacklist = blacklist;
this.snippetCache = snippetCache; this.snippetCache = snippetCache;
this.targetPeer = targetPeer; this.targetPeer = targetPeer;
this.links = -1; this.links = -1;
this.maxDistance = maxDistance;
this.profile = (plasmaSearchProfile) profile.clone(); this.profile = (plasmaSearchProfile) profile.clone();
} }
public void run() { public void run() {
this.links = yacyClient.search(set2string(wordhashes), global, targetPeer, urlManager, entityCache, blacklist, snippetCache, profile); this.links = yacyClient.search(set2string(wordhashes), maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, profile);
if (links != 0) { if (links != 0) {
//yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes); //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes);
yacyCore.seedDB.mySeed.incRI(links); yacyCore.seedDB.mySeed.incRI(links);
@ -172,7 +174,7 @@ public class yacySearch extends Thread {
return result; return result;
} }
public static yacySearch[] searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache, public static yacySearch[] searchHashes(Set wordhashes, int maxDist, plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache,
int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) { int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
// check own peer status // check own peer status
if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; } if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; }
@ -185,8 +187,8 @@ public class yacySearch extends Thread {
if (targets == 0) return null; if (targets == 0) return null;
yacySearch[] searchThreads = new yacySearch[targets]; yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) { for (int i = 0; i < targets; i++) {
searchThreads[i]= new yacySearch(wordhashes, true, targetPeers[i], searchThreads[i]= new yacySearch(wordhashes, maxDist, true, targetPeers[i],
urlManager, entityCache, blacklist, snippetCache, profile); urlManager, containerCache, blacklist, snippetCache, profile);
searchThreads[i].start(); searchThreads[i].start();
try {Thread.sleep(20);} catch (InterruptedException e) {} try {Thread.sleep(20);} catch (InterruptedException e) {}
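searchHashes fans out one thread per target peer and staggers the starts by 20 ms so the outgoing connections do not burst at once. The pattern in isolation, as a sketch with Runnable tasks standing in for the yacySearch threads:

    import java.util.ArrayList;
    import java.util.List;

    class StaggeredFanOut {
        static List<Thread> start(List<Runnable> searchTasks, long staggerMillis) {
            List<Thread> threads = new ArrayList<Thread>();
            for (Runnable task : searchTasks) {
                Thread t = new Thread(task);   // one remote search per thread
                t.start();
                threads.add(t);
                try { Thread.sleep(staggerMillis); } catch (InterruptedException e) { break; }
            }
            return threads;   // callers can later join() these with a timeout
        }
    }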
@ -216,5 +218,4 @@ public class yacySearch extends Thread {
} }
} }
} }
