changes towards the new index storage scheme:

- replaced usage of the temporary IndexEntity with EntryContainer
- added more attributes to the word index
- added exact-string search (using quotes in the query; see the sketch below)
- disabled writing into WORDS during search; EntryContainers are used instead


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1485 6c8d7289-2bf4-0310-a012-ef5d649a1542
author orbiter
parent c81ad1bf34
commit 03c65742ba
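
A minimal, self-contained sketch of the quote handling added in htroot/index.java (the QueryParser/Query names are hypothetical, for illustration only; the logic mirrors the diff below, including the length guard that keeps an empty query from throwing on charAt(0)):

final class QueryParser {
    static final class Query {
        final String words;
        final int maxDistance;
        Query(String words, int maxDistance) { this.words = words; this.maxDistance = maxDistance; }
    }

    static Query parseQuery(String input) {
        int maxDistance = Integer.MAX_VALUE; // default: no word-distance constraint
        String querystring = input.trim();
        // a quoted query means exact-string search: strip the quotes and
        // require the words to stand directly next to each other
        if (querystring.length() > 1
                && querystring.charAt(0) == '"'
                && querystring.charAt(querystring.length() - 1) == '"') {
            querystring = querystring.substring(1, querystring.length() - 1).trim();
            maxDistance = 1;
        }
        return new Query(querystring, maxDistance);
    }

    public static void main(String[] args) {
        Query q = parseQuery("\"red apples\"");
        System.out.println(q.words + " / maxDistance=" + q.maxDistance); // red apples / maxDistance=1
    }
}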

@ -62,6 +62,7 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient;
@ -255,12 +256,12 @@ public class IndexControl_p {
}
prop.put("urlstring", "");
prop.put("urlhash", "");
plasmaWordIndexEntity[] indexes = new plasmaWordIndexEntity[1];
plasmaWordIndexEntryContainer[] indexes = new plasmaWordIndexEntryContainer[1];
String result;
long starttime = System.currentTimeMillis();
indexes[0] = switchboard.wordIndex.getEntity(keyhash, true, -1);
indexes[0] = switchboard.wordIndex.getContainer(keyhash, true, -1);
// built urlCache
Iterator urlIter = indexes[0].elements(true);
Iterator urlIter = indexes[0].entries();
HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet();
plasmaWordIndexEntry indexEntry;
@ -282,9 +283,7 @@ public class IndexControl_p {
// now delete all entries that have no url entry
Iterator hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
try {
indexes[0].removeEntry((String) hashIter.next(), false);
} catch (IOException e) {}
indexes[0].remove((String) hashIter.next());
}
// use whats remaining
String gzipBody = switchboard.getConfig("indexControl.gzipBody","false");
@ -296,7 +295,8 @@ public class IndexControl_p {
"true".equalsIgnoreCase(gzipBody),
timeout);
prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
try {indexes[0].close();} catch (IOException e) {}
indexes[0] = null;
indexes = null;
}
// generate list
@ -431,15 +431,15 @@ public class IndexControl_p {
public static String genUrlList(plasmaSwitchboard switchboard, String keyhash, String keystring) {
// search for a word hash and generate a list of url links
plasmaWordIndexEntity index = null;
plasmaWordIndexEntryContainer index = null;
try {
index = switchboard.wordIndex.getEntity(keyhash, true, -1);
index = switchboard.wordIndex.getContainer(keyhash, true, -1);
final StringBuffer result = new StringBuffer(1024);
if (index.size() == 0) {
result.append("No URL entries related to this word hash <span class=\"tt\">").append(keyhash).append("</span>.");
} else {
final Iterator en = index.elements(true);
final Iterator en = index.entries();
result.append("URL entries related to this word hash <span class=\"tt\">").append(keyhash).append("</span><br><br>");
result.append("<form action=\"IndexControl_p.html\" method=\"post\" enctype=\"multipart/form-data\">");
String us;
@ -497,13 +497,12 @@ public class IndexControl_p {
.append("<span class=\"small\">for every resolveable and deleted URL reference, delete the same reference at every other word where the reference exists (very extensive, but prevents further unresolved references)</span>")
.append("</td></tr></table></fieldset></form><br>");
}
index.close();
index = null;
return result.toString();
} catch (IOException e) {
return "";
} finally {
if (index != null) try { index.close(); index = null; } catch (Exception e) {};
if (index != null) index = null;
}
}
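
For orientation while reading these diffs: the file-backed plasmaWordIndexEntity (one .db file per word, opened and closed with IOException handling) is replaced by the purely in-memory plasmaWordIndexEntryContainer, which is why the try/close blocks above disappear. A simplified stand-in class (hypothetical, for illustration; the method names follow this commit) shows the API shape:

final class EntryContainerSketch {
    private final String wordHash;
    private final java.util.TreeMap entries = new java.util.TreeMap(); // urlHash -> index entry

    EntryContainerSketch(String wordHash) { this.wordHash = wordHash; }

    String wordHash() { return wordHash; }
    int size() { return entries.size(); }
    void add(String urlHash, Object entry) { entries.put(urlHash, entry); } // replaces entity.addEntry(...)
    void remove(String urlHash) { entries.remove(urlHash); }                // replaces entity.removeEntry(urlHash, false)
    java.util.Iterator entries() { return entries.values().iterator(); }    // replaces entity.elements(true)
    void clear() { entries.clear(); }                                       // no close() needed: no file handle
}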

@ -463,7 +463,7 @@ public class dir {
"AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/
false, /*localneed*/
condenser.RESULT_INFORMATION_VALUE,
condenser.RESULT_WORD_ENTROPHY,
"**", /*language*/
plasmaWordIndexEntry.DT_SHARE, /*doctype*/
phrase.length(), /*size*/

@ -126,7 +126,12 @@ public class index {
// SEARCH
// process search words
final String querystring = post.get("search", "");
int maxDistance = Integer.MAX_VALUE;
String querystring = post.get("search", "").trim();
if ((querystring.length() > 1) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
querystring = querystring.substring(1, querystring.length() - 1).trim();
maxDistance = 1;
}
if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {}
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
// filter out stopwords
@ -172,7 +177,7 @@ public class index {
}
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, maxDistance, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20);
final serverObjects prop = sb.searchFromLocal(thisSearch);

@ -47,7 +47,6 @@
// javac -classpath .:../../Classes search.java
// if the shell's current path is htroot/yacy
import java.io.IOException;
import java.util.HashSet;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
@ -81,6 +80,7 @@ public final class search {
// final String fwden = post.get("fwden", ""); // forward deny, a list of seed hashes. They may NOT be target of forward hopping
final long duetime= post.getLong("duetime", 3000);
final int count = post.getInt("count", 10); // maximum number of wanted results
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@ -103,8 +103,8 @@ public final class search {
}
final long timestamp = System.currentTimeMillis();
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
count, duetime, ".*");
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
count, duetime, ".*");
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
serverObjects prop = new serverObjects();
@ -114,11 +114,8 @@ public final class search {
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
plasmaSearchResult acc = null;
int idxc = 0;
try {
idxc = theSearch.localSearch();
acc = theSearch.order();
} catch (IOException e) {
}
idxc = theSearch.localSearch();
acc = theSearch.order();
// result is a List of urlEntry elements
if ((idxc == 0) || (acc == null)) {

@ -75,6 +75,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
linkTags0.add("img");
linkTags0.add("base");
linkTags0.add("frame");
linkTags0.add("meta");
linkTags1 = new TreeSet(insensitiveCollator);
linkTags1.add("a");
@ -88,6 +89,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// class variables: collectors for links
private HashMap anchors;
private HashMap images;
private HashMap metas;
private String title;
//private String headline;
private List[] headlines;
@ -101,6 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.root = root;
this.anchors = new HashMap();
this.images = new HashMap();
this.metas = new HashMap();
this.title = "";
this.headlines = new ArrayList[4];
for (int i = 0; i < 4; i++) headlines[i] = new ArrayList();
@ -193,7 +196,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return null;
}
}
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
public static String[] urlComps(String normalizedURL) {
return normalizedURL.toLowerCase().split(splitrex); // word components of the url
}
private String absolutePath(String relativePath) {
try {
return urlNormalform(new URL(root, relativePath));
@ -206,6 +214,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {}
if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
if (tagname.equalsIgnoreCase("meta")) metas.put((tagopts.getProperty("name", "")).toLowerCase(), tagopts.getProperty("content",""));
}
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
@ -252,10 +261,16 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// construct a title string, even if the document has no title
// if there is one, return it
if (title.length() > 0) return title;
// otherwise take any headline
for (int i = 0; i < 4; i++) {
if (headlines[i].size() > 0) return (String) headlines[i].get(0);
}
// take description tag
String s = getDescription();
if (s.length() > 0) return s;
// extract headline from content
if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80));
return cleanLine(content.trim().toString());
@ -280,6 +295,45 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return images;
}
public Map getMetas() {
return metas;
}
public String getDescription() {
String s = (String) metas.get("description");
if (s == null) return ""; else return s;
}
public String getContentType() {
String s = (String) metas.get("content-type");
if (s == null) return ""; else return s;
}
public String getCopyright() {
String s = (String) metas.get("copyright");
if (s == null) return ""; else return s;
}
public String[] getContentLanguages() {
String s = (String) metas.get("content-language");
if (s == null) s = "";
return s.split(" |,");
}
public String[] getKeywords() {
String s = (String) metas.get("keywords");
if (s == null) s = "";
if (s.length() == 0) {
return getTitle().toLowerCase().split(splitrex);
} else {
return s.split(" |,");
}
}
/*
* (non-Javadoc)
* @see de.anomic.htmlFilter.htmlFilterScraper#close()
*/
public void close() {
// free resources
super.close();
@ -298,6 +352,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
System.out.println("ANCHORS :" + anchors.toString());
System.out.println("IMAGES :" + images.toString());
System.out.println("METAS :" + metas.toString());
System.out.println("TEXT :" + new String(content.getBytes()));
}
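
The new splitrex/urlComps helpers above decompose a URL into lower-case word components (reused for ranking in plasmaSearchResult further down). A standalone demo of the split, with an illustrative URL:

public final class UrlCompsDemo {
    // same expression as htmlFilterContentScraper.splitrex in this commit
    public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';

    public static void main(String[] args) {
        String url = "http://www.example.com/word-index_scheme.html";
        String[] comps = url.toLowerCase().split(splitrex); // as in urlComps()
        for (String c : comps) System.out.println(c);
        // prints: http, two empty strings (from "//"), www, example, com,
        // word, index, scheme, html
    }
}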

@ -850,11 +850,11 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
this.rot = rotating;
ii = new nodeIterator(asc, rot, start);
nextNode = (ii.hasNext()) ? (Node) ii.next() : null;
if (nextNode != null) {
if ((nextNode != null) && (nextNode.getKey() != null)) {
int c = objectOrder.compare(firstKey, nextNode.getKey());
if ((c > 0) && (asc)) {
// firstKey > nextNode.getKey()
log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey()));
if (log != null) log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey()));
nextNode = (ii.hasNext()) ? (Node) ii.next() : null;
}
if ((c < 0) && (!(asc))) {

@ -83,9 +83,6 @@ public final class plasmaCondenser {
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public int RESULT_SIMI_SENTENCES = -1;
public int RESULT_AVERAGE_WORD_OCC = -1;
public int RESULT_INFORMATION_VALUE = -1;
public plasmaCondenser(InputStream text) {
this(text, 3, 2);
@ -357,8 +354,7 @@ public final class plasmaCondenser {
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
this.RESULT_SIMI_SENTENCES = sentences.size();
this.RESULT_AVERAGE_WORD_OCC = (words.size() == 0) ? 0 : (allwordcounter / words.size());
this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
//this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
}
public void print() {

@ -176,7 +176,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
public synchronized Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
public Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
return new Entry(hash, searchedWord);
}
@ -399,8 +399,16 @@ public final class plasmaCrawlLURL extends plasmaURL {
private int size;
private int wordCount;
private String snippet;
private plasmaWordIndexEntry word;
private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests
// more needed attributes:
// - author / copyright owner
// - keywords
// - phrasecount, total number of phrases
// - boolean: URL attributes
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
// create new entry and store it into database
this.urlHash = urlHash(url);

@ -63,7 +63,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
private plasmaWordIndex wordIndex;
private plasmaCrawlLURL urlStore;
private plasmaSnippetCache snippetCache;
private plasmaWordIndexEntity rcLocal, rcGlobal; // caches for results
private plasmaWordIndexEntryContainer rcLocal, rcGlobal; // caches for results
private plasmaSearchProfile profileLocal, profileGlobal;
private yacySearch[] searchThreads;
@ -73,8 +73,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
this.query = query;
this.urlStore = urlStore;
this.snippetCache = snippetCache;
this.rcLocal = new plasmaWordIndexEntity(null);
this.rcGlobal = new plasmaWordIndexEntity(null);
this.rcLocal = new plasmaWordIndexEntryContainer(null);
this.rcGlobal = new plasmaWordIndexEntryContainer(null);
if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) {
this.profileLocal = new plasmaSearchProfile(4 * query.maximumTime / 10, query.wantedResults);
this.profileGlobal = new plasmaSearchProfile(6 * query.maximumTime / 10, query.wantedResults);
@ -114,68 +114,56 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
int globalContributions = globalSearch(fetchpeers);
log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
try {
// combine the result and order
plasmaSearchResult result = order();
result.globalContributions = globalContributions;
result.localContributions = rcLocal.size();
// flush results in a separate thread
this.start(); // start to flush results
//serverInstantThread.oneTimeJob(this, "flushResults", log, 0);
// clean up
if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close();
rcLocal = null;
// return search result
log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
lastEvent = this;
return result;
} catch (IOException e) {
return null;
}
// combine the result and order
plasmaSearchResult result = order();
result.globalContributions = globalContributions;
result.localContributions = rcLocal.size();
// flush results in a separate thread
this.start(); // start to flush results
//serverInstantThread.oneTimeJob(this, "flushResults", log, 0);
// clean up
rcLocal = null;
// return search result
log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
lastEvent = this;
return result;
} else {
// do a local search
//long start = System.currentTimeMillis();
try {
localSearch();
plasmaSearchResult result = order();
result.localContributions = rcLocal.size();
// clean up
if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close();
rcLocal = null;
// return search result
log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
lastEvent = this;
return result;
} catch (IOException e) {
return null;
}
localSearch();
plasmaSearchResult result = order();
result.localContributions = rcLocal.size();
// clean up
rcLocal = null;
// return search result
log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
lastEvent = this;
return result;
}
}
public int localSearch() throws IOException {
public int localSearch() {
// search for the set of hashes and return an array of urlEntry elements
// retrieve entities that belong to the hashes
profileLocal.startTimer();
Set entities = wordIndex.getEntities(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION));
if (entities.size() < query.size()) entities = null; // prevent that only a subset is returned
Set containers = wordIndex.getContainers(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION));
if (containers.size() < query.size()) containers = null; // prevent that only a subset is returned
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_COLLECTION);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (entities == null) ? 0 : entities.size());
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (containers == null) ? 0 : containers.size());
// since this is a conjunction we return an empty entity if any word is not known
if (entities == null) {
rcLocal = new plasmaWordIndexEntity(null);
if (containers == null) {
rcLocal = new plasmaWordIndexEntryContainer(null);
return 0;
}
// join the result
profileLocal.startTimer();
rcLocal = plasmaWordIndexEntity.joinEntities(entities, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN));
rcLocal = plasmaWordIndexEntryContainer.joinContainer(containers, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN), query.maxDistance);
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_JOIN);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_JOIN, rcLocal.size());
@ -190,7 +178,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS");
long timeout = System.currentTimeMillis() + profileGlobal.duetime() + 4000;
searchThreads = yacySearch.searchHashes(query.queryHashes, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal);
searchThreads = yacySearch.searchHashes(query.queryHashes, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal);
// wait until wanted delay passed or wanted result appeared
while (System.currentTimeMillis() < timeout) {
@ -204,20 +192,20 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
return rcGlobal.size();
}
public plasmaSearchResult order() throws IOException {
public plasmaSearchResult order() {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
plasmaWordIndexEntity searchResult = new plasmaWordIndexEntity(null);
searchResult.merge(rcLocal, -1);
searchResult.merge(rcGlobal, -1);
plasmaWordIndexEntryContainer searchResult = new plasmaWordIndexEntryContainer(null);
searchResult.add(rcLocal);
searchResult.add(rcGlobal);
long preorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_PRESORT);
long postorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_POSTSORT);
profileLocal.startTimer();
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query);
preorder.addEntity(searchResult, preorderTime);
preorder.addContainer(searchResult, preorderTime);
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_PRESORT);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_PRESORT, rcLocal.size());
@ -289,19 +277,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
Iterator hashi = query.queryHashes.iterator();
while (hashi.hasNext()) {
wordHash = (String) hashi.next();
Iterator i = rcGlobal.elements(true);
plasmaWordIndexEntry entry;
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash, rcGlobal.size());
while (i.hasNext()) {
entry = (plasmaWordIndexEntry) i.next();
container.add(entry, System.currentTimeMillis());
}
wordIndex.addEntries(container, true);
log.logFine("FLUSHED " + wordHash + ": " + container.size() + " url entries");
rcGlobal.setWordHash(wordHash);
wordIndex.addEntries(rcGlobal, true);
log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries");
}
// the rcGlobal was flushed, empty it
count += rcGlobal.size();
rcGlobal.deleteComplete();
rcGlobal.clear();
}
// wait a little bit before trying again
try {Thread.sleep(3000);} catch (InterruptedException e) {}

@ -116,8 +116,8 @@ public final class plasmaSearchPreOrder {
return (plasmaWordIndexEntry) pageAcc.remove(top);
}
public void addEntity(plasmaWordIndexEntity entity, long maxTime) {
Iterator i = entity.elements(true);
public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) {
Iterator i = container.entries();
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
plasmaWordIndexEntry entry;
while (i.hasNext()) {

@ -72,12 +72,14 @@ public final class plasmaSearchQuery {
public int domType;
public String domGroupName;
public int domMaxTargets;
public int maxDistance;
public plasmaSearchQuery(Set queryWords,
public plasmaSearchQuery(Set queryWords, int maxDistance,
String[] order, int wantedResults, long maximumTime, String urlMask,
String referrer,
int domType, String domGroupName, int domMaxTargets) {
this.queryWords = queryWords;
this.maxDistance = maxDistance;
this.queryHashes = words2hashes(queryWords);
this.order = order;
this.wantedResults = wantedResults;
@ -89,9 +91,10 @@ public final class plasmaSearchQuery {
this.domMaxTargets = domMaxTargets;
}
public plasmaSearchQuery(Set queryHashes,
public plasmaSearchQuery(Set queryHashes, int maxDistance,
String[] order, int wantedResults, long maximumTime, String urlMask) {
this.queryWords = null;
this.maxDistance = maxDistance;
this.queryHashes = queryHashes;
this.order = order;
this.wantedResults = wantedResults;
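
Both plasmaSearchQuery constructors now take maxDistance as their second argument; a usage sketch assembled from the call sites in this commit (the variables come from the surrounding servlet code):

// local web search (htroot/index.java); maxDistance is 1 for a quoted query
plasmaSearchQuery thisSearch = new plasmaSearchQuery(
        query, maxDistance,
        new String[]{order1, order2, order3},
        count, searchtime, urlmask, referer,
        plasmaSearchQuery.SEARCHDOM_LOCAL, "", 20);

// remote search servlet (htroot/yacy/search.java); maxdist arrives as a POST field
plasmaSearchQuery squery = new plasmaSearchQuery(
        keyhashes, maxdist,
        new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
        count, duetime, ".*");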

@ -54,11 +54,10 @@ import java.net.MalformedURLException;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverCodings;
import de.anomic.htmlFilter.htmlFilterContentScraper;
public final class plasmaSearchResult {
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
@ -111,8 +110,8 @@ public final class plasmaSearchResult {
URL url = page.url();
String descr = page.descr();
if ((url == null) || (descr == null)) return;
String[] urlcomps = url.toString().toLowerCase().split(splitrex); // word components of the url
String[] descrcomps = descr.toLowerCase().split(splitrex); // words in the description
String[] urlcomps = htmlFilterContentScraper.urlComps(url.toString()); // word components of the url
String[] descrcomps = descr.toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
// store everything
Object[] resultVector = new Object[] {indexEntry, page, urlcomps, descrcomps};

@ -1285,7 +1285,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
yacyCore.seedDB.mySeed.hash,
referrerHash,
0, true,
condenser.RESULT_INFORMATION_VALUE,
condenser.RESULT_WORD_ENTROPHY,
plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()),
(int) entry.size(),
@ -1313,15 +1313,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else {
HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry);
ArrayList tmpEntities = new ArrayList(condenser.RESULT_SIMI_WORDS);
ArrayList tmpContainers = new ArrayList(condenser.RESULT_SIMI_WORDS);
String language = plasmaWordIndexEntry.language(entry.url());
char doctype = plasmaWordIndexEntry.docType(document.getMimeType());
int quality = 0;
try {
quality = condenser.RESULT_INFORMATION_VALUE;
} catch (NumberFormatException e) {
System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + newEntry.url().toString());
}
int urlLength = newEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length;
// iterate over all words
Iterator i = condenser.words();
@ -1332,8 +1328,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = plasmaWordIndexEntry.word2hash(word);
plasmaWordIndexEntity wordIdxEntity = new plasmaWordIndexEntity(wordHash);
plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash);
plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps,
wordStat.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
@ -1344,26 +1341,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
newEntry.size(),
docDate.getTime(),
System.currentTimeMillis(),
quality, language, doctype, true);
wordIdxEntity.addEntry(wordIdxEntry);
tmpEntities.add(wordIdxEntity);
condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
true);
wordIdxContainer.add(wordIdxEntry);
tmpContainers.add(wordIdxContainer);
// wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));
}
//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
words = condenser.RESULT_SIMI_WORDS;
// transfering the index to the storage peer
String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000);
String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]),urlCache,true,120000);
if (error != null) {
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
}
// cleanup
for (int j=0; j < tmpEntities.size(); j++) {
plasmaWordIndexEntity tmpEntity = (plasmaWordIndexEntity) tmpEntities.get(j);
try { tmpEntity.close(); } catch (Exception e) {}
}
tmpContainers = null;
}
storageEndTime = System.currentTimeMillis();

@ -56,6 +56,7 @@ import java.util.Set;
import java.util.Date;
import java.net.URL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.server.logging.serverLog;
@ -136,16 +137,8 @@ public final class plasmaWordIndex {
public int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaCondenser condenser, String language, char doctype) {
// this is called by the switchboard to put in a new page into the index
// use all the words in one condenser object to simultanous create index
// entries
// int age = microDateDays(urlModified);
int quality = 0;
try {
quality = condenser.RESULT_INFORMATION_VALUE;
} catch (NumberFormatException e) {
System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString());
}
// use all the words in one condenser object to simultaneously create index entries
// iterate over all words
Iterator i = condenser.words();
Map.Entry wentry;
@ -153,6 +146,9 @@ public final class plasmaWordIndex {
plasmaWordIndexEntry ientry;
plasmaCondenser.wordStatProp wprop;
String wordHash;
int urlLength = url.toString().length();
int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
while (i.hasNext()) {
wentry = (Map.Entry) i.next();
word = (String) wentry.getKey();
@ -160,6 +156,7 @@ public final class plasmaWordIndex {
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaWordIndexEntry.word2hash(word);
ientry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps,
wprop.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
@ -170,18 +167,54 @@ public final class plasmaWordIndex {
size,
urlModified.getTime(),
System.currentTimeMillis(),
quality, language, doctype, true);
condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
true);
addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false);
}
// System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
// condenser.getWords().size() + " words, flushed " + c + " entries");
return condenser.RESULT_SIMI_WORDS;
}
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
return ramCache.getContainer(wordHash, deleteIfEmpty, maxTime);
}
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
return ramCache.getIndex(wordHash, deleteIfEmpty, maxTime);
return ramCache.getEntity(wordHash, deleteIfEmpty, maxTime);
}
public Set getContainers(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
// retrieve entities that belong to the hashes
HashSet containers = new HashSet();
String singleHash;
plasmaWordIndexEntryContainer singleContainer;
Iterator i = wordHashes.iterator();
long start = System.currentTimeMillis();
long remaining;
while (i.hasNext()) {
// check time
remaining = maxTime - (System.currentTimeMillis() - start);
//if ((maxTime > 0) && (remaining <= 0)) break;
// get next hash:
singleHash = (String) i.next();
// retrieve index
singleContainer = getContainer(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size()));
// check result
if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashSet();
containers.add(singleContainer);
}
return containers;
}
/*
public Set getEntities(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
// retrieve entities that belong to the hashes
@ -203,13 +236,14 @@ public final class plasmaWordIndex {
singleEntity = getEntity(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - entities.size()));
// check result
if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return null;
if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return new HashSet();
entities.add(singleEntity);
}
return entities;
}
*/
public int size() {
return ramCache.size();
}
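
getContainers() above splits the remaining time budget evenly over the hashes still to be fetched (remaining / (wordHashes.size() - containers.size())), so an expensive early lookup cannot starve the later ones. A standalone illustration of that arithmetic:

public final class TimeBudgetDemo {
    public static void main(String[] args) throws InterruptedException {
        long maxTime = 900; // total budget in milliseconds (illustrative)
        int wordHashes = 3; // number of hashes to fetch
        long start = System.currentTimeMillis();
        for (int fetched = 0; fetched < wordHashes; fetched++) {
            long remaining = maxTime - (System.currentTimeMillis() - start);
            long slice = remaining / (wordHashes - fetched); // equal share of what is left
            System.out.println("lookup " + fetched + " gets up to " + slice + " ms");
            Thread.sleep(100); // stand-in for the actual container lookup
        }
    }
}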

@ -203,7 +203,7 @@ public final class plasmaWordIndexAssortmentCluster {
}
public plasmaWordIndexEntryContainer removeFromAll(String wordHash, long maxTime) {
// collect all records from all the assortments and return them
// removes all records from all the assortments and return them
plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash);
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
for (int i = 0; i < clusterCount; i++) {
@ -214,6 +214,18 @@ public final class plasmaWordIndexAssortmentCluster {
return record;
}
public plasmaWordIndexEntryContainer getFromAll(String wordHash, long maxTime) {
// collect all records from all the assortments and return them
plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash);
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
for (int i = 0; i < clusterCount; i++) {
buffer = assortments[i].get(wordHash);
if (buffer != null) record.add(buffer);
if (System.currentTimeMillis() > limitTime) break;
}
return record;
}
public Iterator hashConjunction(String startWordHash, boolean up, boolean rot) {
HashSet iterators = new HashSet();
//if (rot) System.out.println("WARNING: kelondroMergeIterator does not work correctly when individual iterators rotate on their own!");

@ -391,7 +391,18 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
}
}
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) {
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
long start = System.currentTimeMillis();
if (maxTime > 0) maxTime = 8 * maxTime / 10; // reserve time for later adding to backend
plasmaWordIndexEntryContainer container = assortmentCluster.getFromAll(wordHash, maxTime);
if (container == null) {
container = new plasmaWordIndexEntryContainer(wordHash);
}
container.add(backend.getContainer(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime - (System.currentTimeMillis() - start)));
return container;
}
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
// this possibly creates an index file in the back-end
// the index file is opened and returned as entity object
long start = System.currentTimeMillis();
@ -406,7 +417,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
}
}
long r = maxTime - (System.currentTimeMillis() - start);
return backend.getIndex(wordHash, deleteIfEmpty, (r < 0) ? 0 : r);
return backend.getEntity(wordHash, deleteIfEmpty, (r < 0) ? 0 : r);
}
public long getUpdateTime(String wordHash) {

@ -181,7 +181,24 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
}
}
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) {
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
long start = System.currentTimeMillis();
if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) {
plasmaWordIndexEntity entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash);
plasmaWordIndexEntry entry;
Iterator i = entity.elements(true);
while ((i.hasNext()) && ((maxTime < 0) || (System.currentTimeMillis() < start + maxTime))) {
entry = (plasmaWordIndexEntry) i.next();
container.add(entry);
}
return container;
} else {
return new plasmaWordIndexEntryContainer(wordHash, 0);
}
}
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty);
}
@ -190,7 +207,6 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
if (f.exists()) return f.lastModified(); else return -1;
}
public void deleteIndex(String wordHash) {
plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash);
}
@ -200,7 +216,7 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
plasmaWordIndexEntity pi = null;
int count = 0;
try {
pi = getIndex(wordHash, true, -1);
pi = getEntity(wordHash, true, -1);
for (int i = 0; i < urlHashes.length; i++)
if (pi.removeEntry(urlHashes[i], deleteComplete)) count++;
int size = pi.size();

@ -201,33 +201,33 @@ public final class plasmaWordIndexDistribution {
// collect index
String startPointHash = selectTransferStart();
log.logFine("Selected hash " + startPointHash + " as start point for index distribution, distance = " + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash));
Object[] selectResult = selectTransferIndexes(startPointHash, indexCount, this.maxOpenFiles4Distribution);
plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) selectResult[0];
Object[] selectResult = selectTransferContainers(startPointHash, indexCount, this.maxOpenFiles4Distribution);
plasmaWordIndexEntryContainer[] indexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
//Integer openedFiles = (Integer) selectResult[2];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
if ((indexEntities == null) || (indexEntities.length == 0)) {
if ((indexContainers == null) || (indexContainers.length == 0)) {
log.logFine("No index available for index transfer, hash start-point " + startPointHash);
return -1;
}
// count the indexes again, can be smaller as expected
indexCount = 0;
for (int i = 0; i < indexEntities.length; i++) {
indexCount += indexEntities[i].size();
for (int i = 0; i < indexContainers.length; i++) {
indexCount += indexContainers[i].size();
}
if (indexCount < 50) {
log.logFine("Too few (" + indexCount + ") indexes selected for transfer.");
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
return -1; // failed
}
// find start point for DHT-selection
String keyhash = indexEntities[indexEntities.length - 1].wordHash(); // DHT targets must have greater hashes
String keyhash = indexContainers[indexContainers.length - 1].wordHash(); // DHT targets must have greater hashes
// find a list of DHT-peers
yacySeed[] seeds = new yacySeed[peerCount + 10];
int hc0 = 0;
double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[0].wordHash()),
yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[indexEntities.length - 1].wordHash()));
double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[0].wordHash()),
yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[indexContainers.length - 1].wordHash()));
double maxDistance = Math.min(ownDistance, 0.4);
synchronized (yacyCore.dhtAgent) {
double avdist;
@ -239,8 +239,8 @@ public final class plasmaWordIndexDistribution {
}
seeds[hc0] = (yacySeed) e.nextElement();
if (seeds[hc0] != null) {
avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[0].wordHash()),
yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[indexEntities.length - 1].wordHash()));
avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[0].wordHash()),
yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[indexContainers.length - 1].wordHash()));
if (avdist < maxDistance) {
log.logInfo("Selected " + ((hc0 < peerCount) ? "primary" : "reserve") + " DHT target peer " + seeds[hc0].getName() + ":" + seeds[hc0].hash + ", distance = " + avdist);
hc0++;
@ -252,7 +252,7 @@ public final class plasmaWordIndexDistribution {
if (hc0 < peerCount) {
log.logWarning("found not enough (" + hc0 + ") peers for distribution");
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
return -1; // failed
}
@ -267,9 +267,9 @@ public final class plasmaWordIndexDistribution {
return -1; // interrupted
}
start = System.currentTimeMillis();
error = yacyClient.transferIndex(seeds[i], indexEntities, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
error = yacyClient.transferIndex(seeds[i], indexContainers, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
if (error == null) {
log.logInfo("Index transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
log.logInfo("Index transfer of " + indexCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
+ " seconds successfull (" + (1000 * indexCount / (System.currentTimeMillis() - start + 1)) + " words/s)");
peerNames += ", " + seeds[i].getName();
hc1++;
@ -286,8 +286,8 @@ public final class plasmaWordIndexDistribution {
// success
if (delete) {
try {
if (deleteTransferIndexes(indexEntities)) {
log.logFine("Deleted all " + indexEntities.length + " transferred whole-word indexes locally");
if (deleteTransferIndexes(indexContainers)) {
log.logFine("Deleted all " + indexContainers.length + " transferred whole-word indexes locally");
return indexCount;
} else {
log.logSevere("Deleted not all transferred whole-word indexes");
@ -299,13 +299,13 @@ public final class plasmaWordIndexDistribution {
}
} else {
// simply close the indexEntities
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
}
return indexCount;
} else {
log.logSevere("Index distribution failed. Too few peers (" + hc1 + ") received the index, not deleted locally.");
// simply close the indexEntities
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
return -1;
}
}
@ -322,15 +322,16 @@ public final class plasmaWordIndexDistribution {
return startPointHash;
}
Object[] /* of {plasmaWordIndexEntity[], HashMap(String, plasmaCrawlLURL.Entry)}*/
selectTransferIndexes(String hash, int count, int maxOpenFiles) {
Object[] /* of {plasmaWordIndexEntryContainer[], HashMap(String, plasmaCrawlLURL.Entry)}*/
selectTransferContainers(String hash, int count, int maxOpenFiles) {
// the hash is a start hash from where the indexes are picked
ArrayList tmpEntities = new ArrayList(count);
ArrayList tmpContainers = new ArrayList(count);
String nexthash = "";
try {
int currOpenFiles = 0;
Iterator wordHashIterator = this.wordIndex.wordHashes(hash, true, true);
plasmaWordIndexEntity indexEntity, tmpEntity;
plasmaWordIndexEntity indexEntity;
plasmaWordIndexEntryContainer indexContainer;
Iterator urlIter;
Iterator hashIter;
plasmaWordIndexEntry indexEntry;
@ -343,56 +344,15 @@ public final class plasmaWordIndexDistribution {
(wordHashIterator.hasNext()) &&
((nexthash = (String) wordHashIterator.next()) != null) &&
(nexthash.trim().length() > 0) &&
((currOpenFiles == 0) || (yacyDHTAction.dhtDistance(nexthash,
((plasmaWordIndexEntity)tmpEntities.get(0)).wordHash()) < 0.2))
((currOpenFiles == 0) ||
(yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntity)tmpContainers.get(0)).wordHash()) < 0.2))
) {
indexEntity = this.wordIndex.getEntity(nexthash, true, -1);
if (indexEntity.size() == 0) {
indexEntity.deleteComplete();
} else if ((indexEntity.size() <= count)|| // if we havn't exceeded the limit
(Math.abs(indexEntity.size() - count) <= 10)){ // or there are only at most 10 entries left
// take the whole entity
try {
// fist check if we know all urls
urlIter = indexEntity.elements(true);
unknownURLEntries.clear();
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
try {
lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry);
if ((lurl == null) || (lurl.url() == null)) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
}
} catch (IOException e) {
unknownURLEntries.add(indexEntry.getUrlHash());
}
}
// now delete all entries that have no url entry
hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
String nextUrlHash = (String) hashIter.next();
indexEntity.removeEntry(nextUrlHash, false);
this.urlPool.loadedURL.remove(nextUrlHash);
}
if (indexEntity.size() == 0) {
indexEntity.deleteComplete();
} else {
// use whats remaining
tmpEntities.add(indexEntity);
this.log.logFine("Selected whole index (" + indexEntity.size() + " URLs, " + unknownURLEntries.size() + " not bound) for word " + indexEntity.wordHash());
count -= indexEntity.size();
currOpenFiles++;
}
} catch (kelondroException e) {
this.log.logSevere("plasmaWordIndexDistribution/1: deleted DB for word " + indexEntity.wordHash(), e);
indexEntity.deleteComplete();
}
} else {
// make an on-the-fly entity and insert values
tmpEntity = new plasmaWordIndexEntity(indexEntity.wordHash());
indexContainer = new plasmaWordIndexEntryContainer(indexEntity.wordHash());
try {
urlIter = indexEntity.elements(true);
unknownURLEntries.clear();
@ -404,7 +364,7 @@ public final class plasmaWordIndexDistribution {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
tmpEntity.addEntry(indexEntry);
indexContainer.add(indexEntry);
count--;
}
} catch (IOException e) {
@ -426,8 +386,8 @@ public final class plasmaWordIndexDistribution {
}
// use whats remaining
this.log.logFine("Selected partial index (" + tmpEntity.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + tmpEntity.wordHash());
tmpEntities.add(tmpEntity);
this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + indexContainer.wordHash());
tmpContainers.add(indexContainer);
} catch (kelondroException e) {
this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + indexEntity.wordHash(), e);
indexEntity.deleteComplete();
@ -438,8 +398,8 @@ public final class plasmaWordIndexDistribution {
}
// transfer to array
plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]);
return new Object[]{indexEntities, knownURLs, new Integer(currOpenFiles)};
plasmaWordIndexEntryContainer[] entryContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
return new Object[]{entryContainers, knownURLs, new Integer(currOpenFiles)};
} catch (IOException e) {
this.log.logSevere("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage(), e);
return new Object[]{new plasmaWordIndexEntryContainer[0], new HashMap(0), new Integer(0)};
@ -477,6 +437,40 @@ public final class plasmaWordIndexDistribution {
} catch (IOException ee) {}
}
void closeTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) {
for (int i = 0; i < indexContainers.length; i++) {
indexContainers[i] = null;
}
}
boolean deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) throws IOException {
Iterator urlIter;
plasmaWordIndexEntry indexEntry;
plasmaWordIndexEntity indexEntity;
String[] urlHashes;
int sz;
boolean success = true;
for (int i = 0; i < indexContainers.length; i++) {
// delete entries separately
int c = 0;
urlHashes = new String[indexContainers[i].size()];
urlIter = indexContainers[i].entries();
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
urlHashes[c++] = indexEntry.getUrlHash();
}
wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true);
indexEntity = wordIndex.getEntity(indexContainers[i].wordHash(), true, -1);
sz = indexEntity.size();
// indexEntity.close();
closeTransferIndex(indexEntity);
log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + sz + " entries left");
indexContainers[i] = null;
}
return success;
}
/*
boolean deleteTransferIndexes(plasmaWordIndexEntity[] indexEntities) throws IOException {
Iterator urlIter;
plasmaWordIndexEntry indexEntry;
@ -500,13 +494,6 @@ public final class plasmaWordIndexDistribution {
// indexEntity.close();
closeTransferIndex(indexEntity);
log.logFine("Deleted partial index (" + c + " URLs) for word " + indexEntities[i].wordHash() + "; " + sz + " entries left");
// DEBUG: now try to delete the remaining index. If this works, this routine is fine
/*
if (wordIndex.getEntity(indexEntities[i].wordHash()).deleteComplete())
System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " SUCCESSFULL");
else
System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " FAILED");
*/
// end debug
indexEntities[i].close();
} else {
@ -516,7 +503,7 @@ public final class plasmaWordIndexDistribution {
} else {
indexEntities[i].close();
// have another try...
if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot() /*PLASMADB*/, indexEntities[i].wordHash()).delete())) {
if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot(), indexEntities[i].wordHash()).delete())) {
success = false;
log.logSevere("Could not delete whole index for word " + indexEntities[i].wordHash());
}
@ -526,7 +513,8 @@ public final class plasmaWordIndexDistribution {
}
return success;
}
*/
public void startTransferWholeIndex(yacySeed seed, boolean delete) {
if (transferIdxThread == null) {
this.transferIdxThread = new transferIndexThread(seed,delete);
@ -573,14 +561,14 @@ public final class plasmaWordIndexDistribution {
// word chunk
private String endPointHash;
private String startPointHash;
plasmaWordIndexEntity[] indexEntities;
plasmaWordIndexEntryContainer[] indexContainers;
// other fields
HashMap urlCache;
public transferIndexWorkerThread(
yacySeed seed,
plasmaWordIndexEntity[] indexEntities,
plasmaWordIndexEntryContainer[] indexContainers,
HashMap urlCache,
boolean gzipBody,
int timeout,
@ -594,7 +582,7 @@ public final class plasmaWordIndexDistribution {
this.timeout4Transfer = timeout;
this.iteration = iteration;
this.seed = seed;
this.indexEntities = indexEntities;
this.indexContainers = indexContainers;
this.urlCache = urlCache;
this.idxCount = idxCount;
this.chunkSize = chunkSize;
@ -657,11 +645,11 @@ public final class plasmaWordIndexDistribution {
// transferring selected words to remote peer
this.status = "Running: Transfering chunk " + iteration;
String error = yacyClient.transferIndex(seed, indexEntities, urlCache, gzipBody4Transfer, timeout4Transfer);
String error = yacyClient.transferIndex(seed, indexContainers, urlCache, gzipBody4Transfer, timeout4Transfer);
if (error == null) {
// words successfully transferred
transferTime = System.currentTimeMillis() - start;
plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "]" +
plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length-1].wordHash() + "]" +
" to peer " + seed.getName() + ":" + seed.hash + " in " + (transferTime/1000) + " seconds successfull (" +
(1000 * idxCount / (transferTime + 1)) + " words/s)");
retryCount = 0;
@ -817,7 +805,7 @@ public final class plasmaWordIndexDistribution {
}
public void performTransferWholeIndex() {
plasmaWordIndexEntity[] newIndexEntities = null, oldIndexEntities = null;
plasmaWordIndexEntryContainer[] newIndexContainers = null, oldIndexContainers = null;
try {
// pausing the regular index distribution
// TODO: adding sync, to wait for a still running index distribution to finish
@ -838,12 +826,12 @@ public final class plasmaWordIndexDistribution {
iteration++;
int idxCount = 0;
selectionStart = System.currentTimeMillis();
oldIndexEntities = newIndexEntities;
oldIndexContainers = newIndexContainers;
// selecting 500 words to transfer
this.status = "Running: Selecting chunk " + iteration;
Object[] selectResult = selectTransferIndexes(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
newIndexEntities = (plasmaWordIndexEntity[]) selectResult[0];
Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
newIndexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
openedFiles = (Integer) selectResult[2];
@ -851,7 +839,7 @@ public final class plasmaWordIndexDistribution {
* a) no words are left in the index
* b) max open file limit was exceeded
*/
if ((newIndexEntities == null) || (newIndexEntities.length == 0)) {
if ((newIndexContainers == null) || (newIndexContainers.length == 0)) {
if (sb.wordIndex.size() > 0) {
// if there are still words in the index we try it again now
startPointHash = "------------";
@ -863,15 +851,15 @@ public final class plasmaWordIndexDistribution {
}
} else {
// count the indexes again, can be smaller as expected
for (int i = 0; i < newIndexEntities.length; i++) idxCount += newIndexEntities[i].size();
for (int i = 0; i < newIndexContainers.length; i++) idxCount += newIndexContainers[i].size();
// getting start point for next DHT-selection
oldStartingPointHash = startPointHash;
startPointHash = newIndexEntities[newIndexEntities.length - 1].wordHash(); // DHT targets must have greater hashes
startPointHash = newIndexContainers[newIndexContainers.length - 1].wordHash(); // DHT targets must have greater hashes
selectionEnd = System.currentTimeMillis();
selectionTime = selectionEnd - selectionStart;
plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexEntities[0].wordHash() + " .. " + newIndexEntities[newIndexEntities.length-1].wordHash() + "]" +
plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexContainers[0].wordHash() + " .. " + newIndexContainers[newIndexContainers.length-1].wordHash() + "]" +
" in " +
(selectionTime / 1000) + " seconds (" +
(1000 * idxCount / (selectionTime+1)) + " words/s)");
@ -886,10 +874,10 @@ public final class plasmaWordIndexDistribution {
this.status = "Aborted because of Transfer error:\n" + worker.getStatus();
// cleanup. closing all open files
closeEntities(oldIndexEntities);
oldIndexEntities = null;
closeEntities(newIndexEntities);
newIndexEntities = null;
closeContainers(oldIndexContainers);
oldIndexContainers = null;
closeContainers(newIndexContainers);
newIndexContainers = null;
// abort index transfer
return;
@ -922,10 +910,10 @@ public final class plasmaWordIndexDistribution {
if (delete) {
this.status = "Running: Deleting chunk " + iteration;
try {
if (deleteTransferIndexes(oldIndexEntities)) {
plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexEntities.length + " transferred whole-word indexes locally");
if (deleteTransferIndexes(oldIndexContainers)) {
plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexContainers.length + " transferred whole-word indexes locally");
transferedEntryCount += idxCount;
transferedEntityCount += oldIndexEntities.length;
transferedEntityCount += oldIndexContainers.length;
} else {
plasmaWordIndexDistribution.this.log.logSevere("Deleted not all transferred whole-word indexes");
}
@ -933,18 +921,18 @@ public final class plasmaWordIndexDistribution {
plasmaWordIndexDistribution.this.log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee);
}
} else {
this.closeEntities(oldIndexEntities);
this.closeContainers(oldIndexContainers);
transferedEntryCount += idxCount;
transferedEntityCount += oldIndexEntities.length;
transferedEntityCount += oldIndexContainers.length;
}
oldIndexEntities = null;
oldIndexContainers = null;
}
this.worker = null;
}
// handover chunk to transfer worker
if (!((newIndexEntities == null) || (newIndexEntities.length == 0))) {
worker = new transferIndexWorkerThread(seed,newIndexEntities,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash);
if (!((newIndexContainers == null) || (newIndexContainers.length == 0))) {
worker = new transferIndexWorkerThread(seed,newIndexContainers,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash);
worker.start();
}
}
@ -961,30 +949,21 @@ public final class plasmaWordIndexDistribution {
try {worker.join();}catch(Exception e){}
// worker = null;
}
if (oldIndexEntities != null) closeEntities(oldIndexEntities);
if (newIndexEntities != null) closeEntities(newIndexEntities);
if (oldIndexContainers != null) closeContainers(oldIndexContainers);
if (newIndexContainers != null) closeContainers(newIndexContainers);
plasmaWordIndexDistribution.this.paused = false;
}
}
private void closeEntities(plasmaWordIndexEntity[] indexEntities) {
if ((indexEntities == null)||(indexEntities.length ==0)) return;
private void closeContainers(plasmaWordIndexEntryContainer[] indexContainers) {
if ((indexContainers == null)||(indexContainers.length ==0)) return;
for (int i = 0; i < indexEntities.length; i++) try {
indexEntities[i].close();
} catch (IOException ee) {}
}
/*
private boolean isAborted() {
if (finished || Thread.currentThread().isInterrupted()) {
this.status = "aborted";
return true;
}
return false;
for (int i = 0; i < indexContainers.length; i++) {
indexContainers[i] = null;
}
}
*/
}
}

@ -48,7 +48,6 @@ import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Set;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroException;
@ -111,6 +110,7 @@ public final class plasmaWordIndexEntity {
hash.substring(4,6) + "/" + hash + ".db");
}
/*
public plasmaWordIndexEntity(String wordHash) {
// this creates a nameless temporary index. It is needed for combined search
// and used to hold the intersection of two indexes
@ -121,7 +121,7 @@ public final class plasmaWordIndexEntity {
theLocation = null;
theTmpMap = new TreeMap();
}
*/
public boolean isTMPEntity() {
return theTmpMap != null;
}
@ -302,12 +302,6 @@ public final class plasmaWordIndexEntity {
else return "EMPTY";
}
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public void merge(plasmaWordIndexEntity otherEntity, long time) throws IOException {
// this is a merge of another entity to this entity
@ -324,6 +318,14 @@ public final class plasmaWordIndexEntity {
}
}
/*
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException {
// big problem here: there cannot be a time-out for join, since a time-out will leave the joined set too big.
@ -485,5 +487,5 @@ public final class plasmaWordIndexEntity {
}
return conj;
}
*/
}

@ -100,18 +100,21 @@ public final class plasmaWordIndexEntry {
public static final char DT_UNKNOWN = 'u';
// appearance locations: (used for flags)
public static final int AP_TITLE = 0; // title tag from html header
public static final int AP_H1 = 1; // h1-tag
public static final int AP_H2 = 2; // h2-tag
public static final int AP_H3 = 3; // h3-tag
public static final int AP_H4 = 4; // h4-tag
public static final int AP_H5 = 5; // h5-tag
public static final int AP_H6 = 6; // h6-tag
public static final int AP_TEXT = 7; // word appears in text (used to check validity of other appearances against spam)
public static final int AP_URL = 8; // word inside an url
public static final int AP_IMG = 9; // tag inside image references
public static final int AP_TAG = 10; // for tagged indexing (i.e. using mp3 tags)
public static final int AP_ANCHOR = 11; // anchor description
public static final int AP_TITLE = 0; // title tag from html header
public static final int AP_H1 = 1; // h1-tag
public static final int AP_H2 = 2; // h2-tag
public static final int AP_H3 = 3; // h3-tag
public static final int AP_H4 = 4; // h4-tag
public static final int AP_H5 = 5; // h5-tag
public static final int AP_H6 = 6; // h6-tag
public static final int AP_TEXT = 7; // word appears in text (used to check validity of other appearances against spam)
public static final int AP_URL = 8; // word inside an url
public static final int AP_IMG = 9; // tag inside image references
public static final int AP_TAG = 10; // for tagged indexing (i.e. using mp3 tags)
public static final int AP_ANCHOR = 11; // anchor description
public static final int AP_BOLD = 12;
public static final int AP_ITALICS = 13;
public static final int AP_INVISIBLE = 14; // good for spam detection
// URL attributes
public static final int UA_LOCAL = 0; // URL was crawled locally
@ -208,6 +211,8 @@ public final class plasmaWordIndexEntry {
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
public plasmaWordIndexEntry(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int hitcount, //*how often appears this word in the text
int wordcount, //*total number of words
int phrasecount, //*total number of phrases
@ -227,14 +232,9 @@ public final class plasmaWordIndexEntry {
// more needed attributes:
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc
// - boolean: URL attributes
// - int: url-length (shorter are better)
// - int: url-number of components / length of path
// - int: length of description tag / title tag (longer are better)
// - int: number of chapters
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - int: length of description
// - int: length of title
// - int: # of keywords
if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk";
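// A minimal sketch of how the two new attributes above are computed when an
// entry is created; this mirrors the yacyClient.search() change later in this
// commit ('urlEntry' is assumed to be an existing URL entry):
//     String urlString = urlEntry.url().toString();
//     int urlLength = urlString.length();                                 // shorter URLs are ranked better
//     int urlComps = htmlFilterContentScraper.urlComps(urlString).length; // fewer path components are ranked better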

@ -54,12 +54,14 @@ package de.anomic.plasma;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
public final class plasmaWordIndexEntryContainer implements Comparable {
private final String wordHash;
private String wordHash;
private final HashMap container; // urlHash/plasmaWordIndexEntry - Mapping
private long updateTime;
@ -73,6 +75,15 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
container = new HashMap(initContainerSize); // a urlhash/plasmaWordIndexEntry - relation
}
public void setWordHash(String newWordHash) {
// this is used to replicate a container for different word indexes during global search
this.wordHash = newWordHash;
}
public void clear() {
container.clear();
}
public int size() {
return container.size();
}
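// A minimal usage sketch for setWordHash(), assuming a hypothetical
// global-search loop ('queryHashes' and 'storeContainer' are illustrative
// names, not part of this diff): one container of remote results is stored
// once per queried word by re-labelling it in between.
//     Iterator w = queryHashes.iterator();
//     while (w.hasNext()) {
//         remoteResult.setWordHash((String) w.next()); // re-label for the next word index
//         storeContainer(remoteResult);                // hypothetical sink for the relabelled copy
//     }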
@ -85,14 +96,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return wordHash;
}
public int add(plasmaWordIndexEntry entry) {
return add(entry, System.currentTimeMillis());
}
public int add(plasmaWordIndexEntry entry, long updateTime) {
this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
return (add(entry)) ? 1 : 0;
return (addi(entry)) ? 1 : 0;
}
public int add(plasmaWordIndexEntry[] entries, long updateTime) {
int c = 0;
for (int i = 0; i < entries.length; i++) if (add(entries[i])) c++;
for (int i = 0; i < entries.length; i++) if (addi(entries[i])) c++;
this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
return c;
}
@ -102,13 +117,13 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
Iterator i = c.entries();
int x = 0;
while (i.hasNext()) {
if (add((plasmaWordIndexEntry) i.next())) x++;
if (addi((plasmaWordIndexEntry) i.next())) x++;
}
this.updateTime = java.lang.Math.max(this.updateTime, c.updateTime);
return x;
}
private boolean add(plasmaWordIndexEntry entry) {
private boolean addi(plasmaWordIndexEntry entry) {
// returns true if the new entry was added, false if it already existed
return (container.put(entry.getUrlHash(), entry) == null);
}
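// Note on the rename above: Java forbids two methods whose parameter lists are
// identical and that differ only in return type, so once the public
// "int add(plasmaWordIndexEntry)" was introduced, the private
// "boolean add(plasmaWordIndexEntry)" had to become addi().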
@ -117,10 +132,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return container.containsKey(urlHash);
}
public plasmaWordIndexEntry get(String urlHash) {
return (plasmaWordIndexEntry) container.get(urlHash);
}
public plasmaWordIndexEntry[] getEntryArray() {
return (plasmaWordIndexEntry[]) container.values().toArray(new plasmaWordIndexEntry[container.size()]); // typed target array; a bare toArray() would return Object[] and fail the cast
}
public plasmaWordIndexEntry remove(String urlHash) {
return (plasmaWordIndexEntry) container.remove(urlHash);
}
public Iterator entries() {
// returns an iterator of plasmaWordIndexEntry objects
return container.values().iterator();
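// A minimal usage sketch for the accessors added above ('wordHash' and 'entry'
// are assumed to exist in the caller):
//     plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer(wordHash);
//     c.add(entry);
//     if (c.contains(entry.getUrlHash())) {
//         plasmaWordIndexEntry e = c.get(entry.getUrlHash()); // lookup by urlHash
//         c.remove(entry.getUrlHash());                       // returns the removed entry, or null
//     }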
@ -146,4 +169,126 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4));
}
public static plasmaWordIndexEntryContainer joinContainer(Set containers, long time, int maxDistance) {
long stamp = System.currentTimeMillis();
// order containers by their size
TreeMap map = new TreeMap();
plasmaWordIndexEntryContainer singleContainer;
Iterator i = containers.iterator();
int count = 0;
while (i.hasNext()) {
// get next container:
singleContainer = (plasmaWordIndexEntryContainer) i.next();
// check result
if ((singleContainer == null) || (singleContainer.size() == 0)) return new plasmaWordIndexEntryContainer(null); // as this is a conjunction of searches, we have no result if any word is not known
// store result in order of result size
map.put(new Long(singleContainer.size() * 1000 + count), singleContainer);
count++;
}
// check if there is any result
if (map.size() == 0) return new plasmaWordIndexEntryContainer(null); // no result, nothing found
// the map now holds the search results in order of number of hits per word
// we now must pairwise build up a conjunction of these sets
Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries
plasmaWordIndexEntryContainer searchA, searchB, searchResult = (plasmaWordIndexEntryContainer) map.remove(k);
while ((map.size() > 0) && (searchResult.size() > 0)) {
// take the first element of map which is a result and combine it with result
k = (Long) map.firstKey(); // the next smallest...
time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
searchA = searchResult;
searchB = (plasmaWordIndexEntryContainer) map.remove(k);
searchResult = plasmaWordIndexEntryContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance);
// free resources
searchA = null;
searchB = null;
}
// 'searchResult' now holds the combined search result
if (searchResult.size() == 0) return new plasmaWordIndexEntryContainer(null);
return searchResult;
}
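// A worked example of the ordering above, with assumed sizes: three word
// results of sizes 5, 120 and 40 get the keys 5*1000+0 = 5000,
// 120*1000+1 = 120001 and 40*1000+2 = 40002, so the TreeMap yields the join
// order 5 -> 40 -> 120 (the counter only keeps equal sizes distinct). Since a
// conjunction can only shrink, every intermediate result stays at most as
// large as the smallest container joined so far, and the remaining time
// budget is re-split over the pending joins as 2*time/(map.size()+1).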
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public static plasmaWordIndexEntryContainer joinConstructive(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) {
if ((i1 == null) || (i2 == null)) return null;
if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntryContainer(null);
// decide which method to use
int high = ((i1.size() > i2.size()) ? i1.size() : i2.size());
int low = ((i1.size() > i2.size()) ? i2.size() : i1.size());
int stepsEnum = 10 * (high + low - 1);
int stepsTest = 12 * log2(high) * low;
// start most efficient method
if (stepsEnum > stepsTest) {
if (i1.size() < i2.size())
return joinConstructiveByTest(i1, i2, time, maxDistance);
else
return joinConstructiveByTest(i2, i1, time, maxDistance);
} else {
return joinConstructiveByEnumeration(i1, i2, time, maxDistance);
}
}
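// A worked example of the cost estimate above (assumed sizes): for
// high = 10000 and low = 100, stepsEnum = 10 * (10000 + 100 - 1) = 100990,
// while stepsTest = 12 * log2(10000) * 100 = 12 * 14 * 100 = 16800; note that
// log2() as defined above returns the bit length, i.e. floor(ld x) + 1.
// Here stepsEnum > stepsTest, so the hash-probing test join is chosen, with
// the smaller container driving the lookups into the larger one.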
private static plasmaWordIndexEntryContainer joinConstructiveByTest(plasmaWordIndexEntryContainer small, plasmaWordIndexEntryContainer large, long time, int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY TEST");
plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result
Iterator se = small.entries();
plasmaWordIndexEntry ie0, ie1;
long stamp = System.currentTimeMillis();
while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
ie0 = (plasmaWordIndexEntry) se.next();
ie1 = large.get(ie0.getUrlHash());
if (ie1 != null) {
// this is a hit. Calculate word distance:
ie0.combineDistance(ie1);
if (ie0.worddistance() <= maxDistance) conj.add(ie0);
}
}
return conj;
}
private static plasmaWordIndexEntryContainer joinConstructiveByEnumeration(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY ENUMERATION");
plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result
Iterator e1 = i1.entries();
Iterator e2 = i2.entries();
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
plasmaWordIndexEntry ie1;
plasmaWordIndexEntry ie2;
ie1 = (plasmaWordIndexEntry) e1.next();
ie2 = (plasmaWordIndexEntry) e2.next();
long stamp = System.currentTimeMillis();
while ((System.currentTimeMillis() - stamp) < time) {
c = ie1.getUrlHash().compareTo(ie2.getUrlHash());
if (c < 0) {
if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
} else if (c > 0) {
if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
} else {
// we have found the same urls in different searches!
ie1.combineDistance(ie2);
if (ie1.worddistance() <= maxDistance) conj.add(ie1);
if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
}
}
}
return conj;
}
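// A caveat on the enumeration join, stated as an assumption: the merge walk
// above only finds all matches if both iterators deliver entries in ascending
// urlHash order. Because the container is backed by a HashMap, entries()
// iterates in undefined order, so a sorted view would be needed first, e.g.:
//     TreeMap sorted = new TreeMap(); // urlHash -> entry, ascending by key
//     Iterator it = i1.entries();
//     plasmaWordIndexEntry e;
//     while (it.hasNext()) { e = (plasmaWordIndexEntry) it.next(); sorted.put(e.getUrlHash(), e); }
//     Iterator e1 = sorted.values().iterator(); // ordered as the walk expects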
}

@ -50,7 +50,8 @@ public interface plasmaWordIndexInterface {
public Iterator wordHashes(String startWordHash, boolean up);
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime);
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime);
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime);
public long getUpdateTime(String wordHash);
public void deleteIndex(String wordHash);

@ -50,12 +50,13 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.plasma.plasmaURLPattern;
@ -348,14 +349,15 @@ public final class yacyClient {
}
public static int search(
String wordhashes,
String wordhashes,
int maxDistance,
boolean global,
yacySeed targetPeer,
plasmaCrawlLURL urlManager,
plasmaWordIndexEntity entityCache,
plasmaURLPattern blacklist,
plasmaSnippetCache snippets,
plasmaSearchProfile profile
plasmaCrawlLURL urlManager,
plasmaWordIndexEntryContainer containerCache,
plasmaURLPattern blacklist,
plasmaSnippetCache snippets,
plasmaSearchProfile profile
) {
// send a search request to peer with remote Hash
// this mainly converts the words into word hashes
@ -403,6 +405,7 @@ public final class yacyClient {
obj.put("ttl", "0");
obj.put("duetime", Long.toString(duetime));
obj.put("profile", profile.targetToString()); // new duetimes splitted by specific search tasks
obj.put("maxdist", maxDistance);
obj.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date()));
//yacyCore.log.logDebug("yacyClient.search url=" + url);
@ -460,6 +463,9 @@ public final class yacyClient {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist
int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
final plasmaWordIndexEntry entry;
@ -467,6 +473,7 @@ public final class yacyClient {
// the old way to define words
entry = new plasmaWordIndexEntry(
urlEntry.hash(),
urlLength, urlComps,
urlEntry.wordCount(),
0, 0, 0, 0, 0, 0,
urlEntry.size(),
@ -494,7 +501,7 @@ public final class yacyClient {
}
// finally insert the containers to the index
for (int m = 0; m < words; m++) { entityCache.addEntries(container[m]); }
for (int m = 0; m < words; m++) { containerCache.add(container[m]); }
// generate statistics
long searchtime;
@ -841,7 +848,7 @@ public final class yacyClient {
httpHeader requestHeader) throws IOException {
*/
public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
HashMap in = transferRWI(targetSeed, indexes, gzipBody, timeout);
if (in == null) { return "no_connection_1"; }
@ -875,7 +882,7 @@ public final class yacyClient {
return null;
}
private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, boolean gzipBody, int timeout) {
private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, boolean gzipBody, int timeout) {
final String address = targetSeed.getAddress();
if (address == null) { return null; }
@ -903,7 +910,7 @@ public final class yacyClient {
Iterator eenum;
plasmaWordIndexEntry entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].elements(true);
eenum = indexes[i].entries();
while (eenum.hasNext()) {
entry = (plasmaWordIndexEntry) eenum.next();
entrypost.append(indexes[i].wordHash())

@ -52,8 +52,8 @@ import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaURLPattern;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaSearchProfile;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.logging.serverLog;
public class yacySearch extends Thread {
@ -61,29 +61,31 @@ public class yacySearch extends Thread {
final private Set wordhashes;
final private boolean global;
final private plasmaCrawlLURL urlManager;
final private plasmaWordIndexEntity entityCache;
final private plasmaWordIndexEntryContainer containerCache;
final private plasmaURLPattern blacklist;
final private plasmaSnippetCache snippetCache;
final private yacySeed targetPeer;
private int links;
private int maxDistance;
final private plasmaSearchProfile profile;
public yacySearch(Set wordhashes, boolean global, yacySeed targetPeer,
plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
public yacySearch(Set wordhashes, int maxDistance, boolean global, yacySeed targetPeer,
plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
super("yacySearch_" + targetPeer.getName());
this.wordhashes = wordhashes;
this.global = global;
this.urlManager = urlManager;
this.entityCache = entityCache;
this.containerCache = containerCache;
this.blacklist = blacklist;
this.snippetCache = snippetCache;
this.targetPeer = targetPeer;
this.links = -1;
this.maxDistance = maxDistance;
this.profile = (plasmaSearchProfile) profile.clone();
}
public void run() {
this.links = yacyClient.search(set2string(wordhashes), global, targetPeer, urlManager, entityCache, blacklist, snippetCache, profile);
this.links = yacyClient.search(set2string(wordhashes), maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, profile);
if (links != 0) {
//yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes);
yacyCore.seedDB.mySeed.incRI(links);
@ -172,7 +174,7 @@ public class yacySearch extends Thread {
return result;
}
public static yacySearch[] searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache,
public static yacySearch[] searchHashes(Set wordhashes, int maxDist, plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache,
int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
// check own peer status
if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; }
@ -185,8 +187,8 @@ public class yacySearch extends Thread {
if (targets == 0) return null;
yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) {
searchThreads[i]= new yacySearch(wordhashes, true, targetPeers[i],
urlManager, entityCache, blacklist, snippetCache, profile);
searchThreads[i]= new yacySearch(wordhashes, maxDist, true, targetPeers[i],
urlManager, containerCache, blacklist, snippetCache, profile);
searchThreads[i].start();
try {Thread.sleep(20);} catch (InterruptedException e) {}
@ -216,5 +218,4 @@ public class yacySearch extends Thread {
}
}
}
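// A minimal call-site sketch for the changed searchHashes() signature; all
// argument names are illustrative, and maxDist is what each remote peer
// receives as the new "maxdist" request parameter:
//     yacySearch[] remote = yacySearch.searchHashes(
//             queryHashes,   // Set of word hashes of the query
//             maxDistance,   // maximum allowed word distance per match
//             urlManager, containerCache,
//             10,            // number of target peers to ask
//             blacklist, snippetCache, profile);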
