changes towards the new index storage scheme:

- replaced usage of the temporary IndexEntity with EntryContainer
- added more attributes to the word index
- added exact-string search (using quotes in the query; see the sketch below)
- disabled writing into WORDS during search; EntryContainers are used instead


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1485 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent c81ad1bf34
commit 03c65742ba
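
A minimal sketch (not the committed code) of how the quoted-query path turns into exact-phrase matching: index.java below sets maxDistance = 1 for quoted queries and hands it through plasmaSearchQuery into the container join. Entry, posInText, and the urlHash-keyed maps here are simplified stand-ins for plasmaWordIndexEntry and plasmaWordIndexEntryContainer, not their real API:

    import java.util.HashMap;
    import java.util.Map;

    final class PhraseJoinSketch {
        // simplified stand-in for plasmaWordIndexEntry: one word occurrence in one URL
        static final class Entry {
            final String urlHash;
            final int posInText;
            Entry(String urlHash, int posInText) { this.urlHash = urlHash; this.posInText = posInText; }
        }
        // conjunction of two word containers keyed by urlHash: keep a URL only if it
        // carries both words and their positions are at most maxDistance apart
        static Map<String, Entry> join(Map<String, Entry> a, Map<String, Entry> b, int maxDistance) {
            Map<String, Entry> joined = new HashMap<String, Entry>();
            for (Map.Entry<String, Entry> e : a.entrySet()) {
                Entry other = b.get(e.getKey());
                if (other == null) continue; // URL must appear in both containers
                if (Math.abs(e.getValue().posInText - other.posInText) <= maxDistance) {
                    joined.put(e.getKey(), e.getValue());
                }
            }
            return joined;
        }
    }

With an unquoted query maxDistance stays Integer.MAX_VALUE, so the position test filters nothing; with quotes it becomes 1 and only URLs where the words are adjacent survive the conjunction.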

@@ -62,6 +62,7 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient;
@@ -255,12 +256,12 @@ public class IndexControl_p {
}
prop.put("urlstring", "");
prop.put("urlhash", "");
plasmaWordIndexEntity[] indexes = new plasmaWordIndexEntity[1];
plasmaWordIndexEntryContainer[] indexes = new plasmaWordIndexEntryContainer[1];
String result;
long starttime = System.currentTimeMillis();
indexes[0] = switchboard.wordIndex.getEntity(keyhash, true, -1);
indexes[0] = switchboard.wordIndex.getContainer(keyhash, true, -1);
// built urlCache
Iterator urlIter = indexes[0].elements(true);
Iterator urlIter = indexes[0].entries();
HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet();
plasmaWordIndexEntry indexEntry;
@@ -282,9 +283,7 @@ public class IndexControl_p {
// now delete all entries that have no url entry
Iterator hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
try {
indexes[0].removeEntry((String) hashIter.next(), false);
} catch (IOException e) {}
indexes[0].remove((String) hashIter.next());
}
// use whats remaining
String gzipBody = switchboard.getConfig("indexControl.gzipBody","false");
@@ -296,7 +295,8 @@ public class IndexControl_p {
"true".equalsIgnoreCase(gzipBody),
timeout);
prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
try {indexes[0].close();} catch (IOException e) {}
indexes[0] = null;
indexes = null;
}
// generate list
@@ -431,15 +431,15 @@ public class IndexControl_p {
public static String genUrlList(plasmaSwitchboard switchboard, String keyhash, String keystring) {
// search for a word hash and generate a list of url links
plasmaWordIndexEntity index = null;
plasmaWordIndexEntryContainer index = null;
try {
index = switchboard.wordIndex.getEntity(keyhash, true, -1);
index = switchboard.wordIndex.getContainer(keyhash, true, -1);
final StringBuffer result = new StringBuffer(1024);
if (index.size() == 0) {
result.append("No URL entries related to this word hash <span class=\"tt\">").append(keyhash).append("</span>.");
} else {
final Iterator en = index.elements(true);
final Iterator en = index.entries();
result.append("URL entries related to this word hash <span class=\"tt\">").append(keyhash).append("</span><br><br>");
result.append("<form action=\"IndexControl_p.html\" method=\"post\" enctype=\"multipart/form-data\">");
String us;
@@ -497,13 +497,12 @@ public class IndexControl_p {
.append("<span class=\"small\">for every resolveable and deleted URL reference, delete the same reference at every other word where the reference exists (very extensive, but prevents further unresolved references)</span>")
.append("</td></tr></table></fieldset></form><br>");
}
index.close();
index = null;
return result.toString();
} catch (IOException e) {
return "";
} finally {
if (index != null) try { index.close(); index = null; } catch (Exception e) {};
if (index != null) index = null;
}
}

@@ -463,7 +463,7 @@ public class dir {
"AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/
false, /*localneed*/
condenser.RESULT_INFORMATION_VALUE,
condenser.RESULT_WORD_ENTROPHY,
"**", /*language*/
plasmaWordIndexEntry.DT_SHARE, /*doctype*/
phrase.length(), /*size*/

@@ -126,7 +126,12 @@ public class index {
// SEARCH
// process search words
final String querystring = post.get("search", "");
int maxDistance = Integer.MAX_VALUE;
String querystring = post.get("search", "").trim();
if ((querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
querystring = querystring.substring(1, querystring.length() - 1).trim();
maxDistance = 1;
}
if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {}
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
// filter out stopwords
@@ -172,7 +177,7 @@ public class index {
}
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, maxDistance, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20);
final serverObjects prop = sb.searchFromLocal(thisSearch);
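
A caveat on the hunk above (an observation, not part of the commit): querystring.charAt(0) runs before any length check, so an empty search field throws StringIndexOutOfBoundsException, and a query consisting of a single quote character would too. A guarded variant of the same test:

    final class QuerySketch {
        // true only for a non-trivial query wrapped in double quotes
        static boolean isPhraseQuery(String q) {
            return q.length() > 1 && q.charAt(0) == '"' && q.charAt(q.length() - 1) == '"';
        }
        // strip the surrounding quotes; the caller then sets maxDistance = 1
        static String stripQuotes(String q) {
            return isPhraseQuery(q) ? q.substring(1, q.length() - 1).trim() : q;
        }
    }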

@@ -47,7 +47,6 @@
// javac -classpath .:../../Classes search.java
// if the shell's current path is htroot/yacy
import java.io.IOException;
import java.util.HashSet;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
@@ -81,6 +80,7 @@ public final class search {
// final String fwden = post.get("fwden", ""); // forward deny, a list of seed hashes. They may NOT be target of forward hopping
final long duetime= post.getLong("duetime", 3000);
final int count = post.getInt("count", 10); // maximum number of wanted results
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@@ -103,7 +103,7 @@ public final class search {
}
final long timestamp = System.currentTimeMillis();
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, maxdist, new String[]{plasmaSearchQuery.ORDER_YBR, plasmaSearchQuery.ORDER_DATE, plasmaSearchQuery.ORDER_QUALITY},
count, duetime, ".*");
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
@@ -114,11 +114,8 @@ public final class search {
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
plasmaSearchResult acc = null;
int idxc = 0;
try {
idxc = theSearch.localSearch();
acc = theSearch.order();
} catch (IOException e) {
}
// result is a List of urlEntry elements
if ((idxc == 0) || (acc == null)) {

@@ -75,6 +75,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
linkTags0.add("img");
linkTags0.add("base");
linkTags0.add("frame");
linkTags0.add("meta");
linkTags1 = new TreeSet(insensitiveCollator);
linkTags1.add("a");
@@ -88,6 +89,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// class variables: collectors for links
private HashMap anchors;
private HashMap images;
private HashMap metas;
private String title;
//private String headline;
private List[] headlines;
@@ -101,6 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.root = root;
this.anchors = new HashMap();
this.images = new HashMap();
this.metas = new HashMap();
this.title = "";
this.headlines = new ArrayList[4];
for (int i = 0; i < 4; i++) headlines[i] = new ArrayList();
@@ -194,6 +197,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
}
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
public static String[] urlComps(String normalizedURL) {
return normalizedURL.toLowerCase().split(splitrex); // word components of the url
}
private String absolutePath(String relativePath) {
try {
return urlNormalform(new URL(root, relativePath));
@@ -206,6 +214,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {}
if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
if (tagname.equalsIgnoreCase("meta")) metas.put((tagopts.getProperty("name", "")).toLowerCase(), tagopts.getProperty("content",""));
}
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
@@ -252,10 +261,16 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// construct a title string, even if the document has no title
// if there is one, return it
if (title.length() > 0) return title;
// othervise take any headline
for (int i = 0; i < 4; i++) {
if (headlines[i].size() > 0) return (String) headlines[i].get(0);
}
// take description tag
String s = getDescription();
if (s.length() > 0) return s;
// extract headline from content
if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80));
return cleanLine(content.trim().toString());
@@ -280,6 +295,45 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return images;
}
public Map getMetas() {
return metas;
}
public String getDescription() {
String s = (String) metas.get("description");
if (s == null) return ""; else return s;
}
public String getContentType() {
String s = (String) metas.get("content-type");
if (s == null) return ""; else return s;
}
public String getCopyright() {
String s = (String) metas.get("copyright");
if (s == null) return ""; else return s;
}
public String[] getContentLanguages() {
String s = (String) metas.get("content-language");
if (s == null) s = "";
return s.split(" |,");
}
public String[] getKeywords() {
String s = (String) metas.get("keywords");
if (s == null) s = "";
if (s.length() == 0) {
return getTitle().toLowerCase().split(splitrex);
} else {
return s.split(" |,");
}
}
/*
* (non-Javadoc)
* @see de.anomic.htmlFilter.htmlFilterScraper#close()
*/
public void close() {
// free resources
super.close();
@@ -298,6 +352,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
System.out.println("ANCHORS :" + anchors.toString());
System.out.println("IMAGES :" + images.toString());
System.out.println("METAS :" + metas.toString());
System.out.println("TEXT :" + new String(content.getBytes()));
}

@@ -850,11 +850,11 @@ public class kelondroTree extends kelondroRecords implements kelondroIndex {
this.rot = rotating;
ii = new nodeIterator(asc, rot, start);
nextNode = (ii.hasNext()) ? (Node) ii.next() : null;
if (nextNode != null) {
if ((nextNode != null) && (nextNode.getKey() != null)) {
int c = objectOrder.compare(firstKey, nextNode.getKey());
if ((c > 0) && (asc)) {
// firstKey > nextNode.getKey()
log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey()));
if (log != null) log.logWarning("CORRECTING ITERATOR: firstKey=" + new String(firstKey) + ", nextNode=" + new String(nextNode.getKey()));
nextNode = (ii.hasNext()) ? (Node) ii.next() : null;
}
if ((c < 0) && (!(asc))) {

@@ -83,9 +83,6 @@ public final class plasmaCondenser {
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public int RESULT_SIMI_SENTENCES = -1;
public int RESULT_AVERAGE_WORD_OCC = -1;
public int RESULT_INFORMATION_VALUE = -1;
public plasmaCondenser(InputStream text) {
this(text, 3, 2);
@@ -357,8 +354,7 @@ public final class plasmaCondenser {
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
this.RESULT_SIMI_SENTENCES = sentences.size();
this.RESULT_AVERAGE_WORD_OCC = (words.size() == 0) ? 0 : (allwordcounter / words.size());
this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
//this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
}
public void print() {

@@ -176,7 +176,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
public synchronized Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
public Entry getEntry(String hash, plasmaWordIndexEntry searchedWord) throws IOException {
return new Entry(hash, searchedWord);
}
@@ -399,7 +399,15 @@ public final class plasmaCrawlLURL extends plasmaURL {
private int size;
private int wordCount;
private String snippet;
private plasmaWordIndexEntry word;
private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests
// more needed attributes:
// - author / copyright owner
// - keywords
// - phrasecount, total number of phrases
// - boolean: URL attributes
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
// create new entry and store it into database

@@ -63,7 +63,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
private plasmaWordIndex wordIndex;
private plasmaCrawlLURL urlStore;
private plasmaSnippetCache snippetCache;
private plasmaWordIndexEntity rcLocal, rcGlobal; // caches for results
private plasmaWordIndexEntryContainer rcLocal, rcGlobal; // caches for results
private plasmaSearchProfile profileLocal, profileGlobal;
private yacySearch[] searchThreads;
@@ -73,8 +73,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
this.query = query;
this.urlStore = urlStore;
this.snippetCache = snippetCache;
this.rcLocal = new plasmaWordIndexEntity(null);
this.rcGlobal = new plasmaWordIndexEntity(null);
this.rcLocal = new plasmaWordIndexEntryContainer(null);
this.rcGlobal = new plasmaWordIndexEntryContainer(null);
if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) {
this.profileLocal = new plasmaSearchProfile(4 * query.maximumTime / 10, query.wantedResults);
this.profileGlobal = new plasmaSearchProfile(6 * query.maximumTime / 10, query.wantedResults);
@@ -114,7 +114,6 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
int globalContributions = globalSearch(fetchpeers);
log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
try {
// combine the result and order
plasmaSearchResult result = order();
result.globalContributions = globalContributions;
@@ -125,57 +124,46 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
//serverInstantThread.oneTimeJob(this, "flushResults", log, 0);
// clean up
if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close();
rcLocal = null;
// return search result
log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
lastEvent = this;
return result;
} catch (IOException e) {
return null;
}
} else {
// do a local search
//long start = System.currentTimeMillis();
try {
localSearch();
plasmaSearchResult result = order();
result.localContributions = rcLocal.size();
// clean up
if ((rcLocal != null) && (!(rcLocal.isTMPEntity()))) rcLocal.close();
rcLocal = null;
// return search result
log.logFine("SEARCHRESULT: " + profileLocal.reportToString());
lastEvent = this;
return result;
} catch (IOException e) {
return null;
}
}
}
public int localSearch() throws IOException {
public int localSearch() {
// search for the set of hashes and return an array of urlEntry elements
// retrieve entities that belong to the hashes
profileLocal.startTimer();
Set entities = wordIndex.getEntities(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION));
if (entities.size() < query.size()) entities = null; // prevent that only a subset is returned
Set containers = wordIndex.getContainers(query.queryHashes, true, true, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_COLLECTION));
if (containers.size() < query.size()) containers = null; // prevent that only a subset is returned
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_COLLECTION);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (entities == null) ? 0 : entities.size());
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_COLLECTION, (containers == null) ? 0 : containers.size());
// since this is a conjunction we return an empty entity if any word is not known
if (entities == null) {
rcLocal = new plasmaWordIndexEntity(null);
if (containers == null) {
rcLocal = new plasmaWordIndexEntryContainer(null);
return 0;
}
// join the result
profileLocal.startTimer();
rcLocal = plasmaWordIndexEntity.joinEntities(entities, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN));
rcLocal = plasmaWordIndexEntryContainer.joinContainer(containers, profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_JOIN), query.maxDistance);
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_JOIN);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_JOIN, rcLocal.size());
@@ -190,7 +178,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS");
long timeout = System.currentTimeMillis() + profileGlobal.duetime() + 4000;
searchThreads = yacySearch.searchHashes(query.queryHashes, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal);
searchThreads = yacySearch.searchHashes(query.queryHashes, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal);
// wait until wanted delay passed or wanted result appeared
while (System.currentTimeMillis() < timeout) {
@@ -204,20 +192,20 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
return rcGlobal.size();
}
public plasmaSearchResult order() throws IOException {
public plasmaSearchResult order() {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
plasmaWordIndexEntity searchResult = new plasmaWordIndexEntity(null);
searchResult.merge(rcLocal, -1);
searchResult.merge(rcGlobal, -1);
plasmaWordIndexEntryContainer searchResult = new plasmaWordIndexEntryContainer(null);
searchResult.add(rcLocal);
searchResult.add(rcGlobal);
long preorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_PRESORT);
long postorderTime = profileLocal.getTargetTime(plasmaSearchProfile.PROCESS_POSTSORT);
profileLocal.startTimer();
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query);
preorder.addEntity(searchResult, preorderTime);
preorder.addContainer(searchResult, preorderTime);
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_PRESORT);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_PRESORT, rcLocal.size());
@@ -289,19 +277,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
Iterator hashi = query.queryHashes.iterator();
while (hashi.hasNext()) {
wordHash = (String) hashi.next();
Iterator i = rcGlobal.elements(true);
plasmaWordIndexEntry entry;
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash, rcGlobal.size());
while (i.hasNext()) {
entry = (plasmaWordIndexEntry) i.next();
container.add(entry, System.currentTimeMillis());
}
wordIndex.addEntries(container, true);
log.logFine("FLUSHED " + wordHash + ": " + container.size() + " url entries");
rcGlobal.setWordHash(wordHash);
wordIndex.addEntries(rcGlobal, true);
log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries");
}
// the rcGlobal was flushed, empty it
count += rcGlobal.size();
rcGlobal.deleteComplete();
rcGlobal.clear();
}
// wait a little bit before trying again
try {Thread.sleep(3000);} catch (InterruptedException e) {}

@@ -116,8 +116,8 @@ public final class plasmaSearchPreOrder {
return (plasmaWordIndexEntry) pageAcc.remove(top);
}
public void addEntity(plasmaWordIndexEntity entity, long maxTime) {
Iterator i = entity.elements(true);
public void addContainer(plasmaWordIndexEntryContainer container, long maxTime) {
Iterator i = container.entries();
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
plasmaWordIndexEntry entry;
while (i.hasNext()) {

@@ -72,12 +72,14 @@ public final class plasmaSearchQuery {
public int domType;
public String domGroupName;
public int domMaxTargets;
public int maxDistance;
public plasmaSearchQuery(Set queryWords,
public plasmaSearchQuery(Set queryWords, int maxDistance,
String[] order, int wantedResults, long maximumTime, String urlMask,
String referrer,
int domType, String domGroupName, int domMaxTargets) {
this.queryWords = queryWords;
this.maxDistance = maxDistance;
this.queryHashes = words2hashes(queryWords);
this.order = order;
this.wantedResults = wantedResults;
@@ -89,9 +91,10 @@ public final class plasmaSearchQuery {
this.domMaxTargets = domMaxTargets;
}
public plasmaSearchQuery(Set queryHashes,
public plasmaSearchQuery(Set queryHashes, int maxDistance,
String[] order, int wantedResults, long maximumTime, String urlMask) {
this.queryWords = null;
this.maxDistance = maxDistance;
this.queryHashes = queryHashes;
this.order = order;
this.wantedResults = wantedResults;

@@ -54,11 +54,10 @@ import java.net.MalformedURLException;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverCodings;
import de.anomic.htmlFilter.htmlFilterContentScraper;
public final class plasmaSearchResult {
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
@@ -111,8 +110,8 @@ public final class plasmaSearchResult {
URL url = page.url();
String descr = page.descr();
if ((url == null) || (descr == null)) return;
String[] urlcomps = url.toString().toLowerCase().split(splitrex); // word components of the url
String[] descrcomps = descr.toLowerCase().split(splitrex); // words in the description
String[] urlcomps = htmlFilterContentScraper.urlComps(url.toString()); // word components of the url
String[] descrcomps = descr.toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
// store everything
Object[] resultVector = new Object[] {indexEntry, page, urlcomps, descrcomps};

@@ -1285,7 +1285,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
yacyCore.seedDB.mySeed.hash,
referrerHash,
0, true,
condenser.RESULT_INFORMATION_VALUE,
condenser.RESULT_WORD_ENTROPHY,
plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()),
(int) entry.size(),
@@ -1313,15 +1313,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else {
HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry);
ArrayList tmpEntities = new ArrayList(condenser.RESULT_SIMI_WORDS);
ArrayList tmpContainers = new ArrayList(condenser.RESULT_SIMI_WORDS);
String language = plasmaWordIndexEntry.language(entry.url());
char doctype = plasmaWordIndexEntry.docType(document.getMimeType());
int quality = 0;
try {
quality = condenser.RESULT_INFORMATION_VALUE;
} catch (NumberFormatException e) {
System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + newEntry.url().toString());
}
int urlLength = newEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length;
// iterate over all words
Iterator i = condenser.words();
@@ -1332,8 +1328,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = plasmaWordIndexEntry.word2hash(word);
plasmaWordIndexEntity wordIdxEntity = new plasmaWordIndexEntity(wordHash);
plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash);
plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps,
wordStat.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
@@ -1344,26 +1341,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
newEntry.size(),
docDate.getTime(),
System.currentTimeMillis(),
quality, language, doctype, true);
wordIdxEntity.addEntry(wordIdxEntry);
tmpEntities.add(wordIdxEntity);
condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
true);
wordIdxContainer.add(wordIdxEntry);
tmpContainers.add(wordIdxContainer);
// wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));
}
//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
words = condenser.RESULT_SIMI_WORDS;
// transfering the index to the storage peer
String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000);
String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntity[tmpContainers.size()]),urlCache,true,120000);
if (error != null) {
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
}
// cleanup
for (int j=0; j < tmpEntities.size(); j++) {
plasmaWordIndexEntity tmpEntity = (plasmaWordIndexEntity) tmpEntities.get(j);
try { tmpEntity.close(); } catch (Exception e) {}
}
tmpContainers = null;
}
storageEndTime = System.currentTimeMillis();

@@ -56,6 +56,7 @@ import java.util.Set;
import java.util.Date;
import java.net.URL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.server.logging.serverLog;
@@ -136,15 +137,7 @@ public final class plasmaWordIndex {
public int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaCondenser condenser, String language, char doctype) {
// this is called by the switchboard to put in a new page into the index
// use all the words in one condenser object to simultanous create index
// entries
// int age = microDateDays(urlModified);
int quality = 0;
try {
quality = condenser.RESULT_INFORMATION_VALUE;
} catch (NumberFormatException e) {
System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString());
}
// use all the words in one condenser object to simultanous create index entries
// iterate over all words
Iterator i = condenser.words();
@@ -153,6 +146,9 @@ public final class plasmaWordIndex {
plasmaWordIndexEntry ientry;
plasmaCondenser.wordStatProp wprop;
String wordHash;
int urlLength = url.toString().length();
int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
while (i.hasNext()) {
wentry = (Map.Entry) i.next();
word = (String) wentry.getKey();
@@ -160,6 +156,7 @@ public final class plasmaWordIndex {
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaWordIndexEntry.word2hash(word);
ientry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps,
wprop.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
@@ -170,7 +167,10 @@ public final class plasmaWordIndex {
size,
urlModified.getTime(),
System.currentTimeMillis(),
quality, language, doctype, true);
condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
true);
addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false);
}
// System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
@@ -178,10 +178,43 @@ public final class plasmaWordIndex {
return condenser.RESULT_SIMI_WORDS;
}
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
return ramCache.getContainer(wordHash, deleteIfEmpty, maxTime);
}
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
return ramCache.getIndex(wordHash, deleteIfEmpty, maxTime);
return ramCache.getEntity(wordHash, deleteIfEmpty, maxTime);
}
public Set getContainers(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
// retrieve entities that belong to the hashes
HashSet containers = new HashSet();
String singleHash;
plasmaWordIndexEntryContainer singleContainer;
Iterator i = wordHashes.iterator();
long start = System.currentTimeMillis();
long remaining;
while (i.hasNext()) {
// check time
remaining = maxTime - (System.currentTimeMillis() - start);
//if ((maxTime > 0) && (remaining <= 0)) break;
// get next hash:
singleHash = (String) i.next();
// retrieve index
singleContainer = getContainer(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size()));
// check result
if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashSet();
containers.add(singleContainer);
}
return containers;
}
/*
public Set getEntities(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
// retrieve entities that belong to the hashes
@@ -203,12 +236,13 @@ public final class plasmaWordIndex {
singleEntity = getEntity(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - entities.size()));
// check result
if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return null;
if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return new HashSet();
entities.add(singleEntity);
}
return entities;
}
*/
public int size() {
return ramCache.size();
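
The time accounting in the new getContainers() divides what is left of maxTime evenly over the word hashes not yet fetched, so time unspent by a fast lookup rolls over to the remaining words. With illustrative numbers: a 600 ms budget over three hashes gives the first lookup 600/3 = 200 ms; if it returns after 150 ms, the second gets (600-150)/2 = 225 ms. The per-word limit reduces to this helper (a restatement of the expression above, with -1 meaning unlimited):

    final class BudgetSketch {
        static long perWordLimit(long maxTime, long elapsed, int totalHashes, int alreadyFetched) {
            if (maxTime < 0) return -1; // unlimited
            return (maxTime - elapsed) / (totalHashes - alreadyFetched);
        }
    }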

@@ -203,7 +203,7 @@ public final class plasmaWordIndexAssortmentCluster {
}
public plasmaWordIndexEntryContainer removeFromAll(String wordHash, long maxTime) {
// collect all records from all the assortments and return them
// removes all records from all the assortments and return them
plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash);
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
for (int i = 0; i < clusterCount; i++) {
@@ -214,6 +214,18 @@ public final class plasmaWordIndexAssortmentCluster {
return record;
}
public plasmaWordIndexEntryContainer getFromAll(String wordHash, long maxTime) {
// collect all records from all the assortments and return them
plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash);
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
for (int i = 0; i < clusterCount; i++) {
buffer = assortments[i].get(wordHash);
if (buffer != null) record.add(buffer);
if (System.currentTimeMillis() > limitTime) break;
}
return record;
}
public Iterator hashConjunction(String startWordHash, boolean up, boolean rot) {
HashSet iterators = new HashSet();
//if (rot) System.out.println("WARNING: kelondroMergeIterator does not work correctly when individual iterators rotate on their own!");

@@ -391,7 +391,18 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
}
}
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) {
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
long start = System.currentTimeMillis();
if (maxTime > 0) maxTime = 8 * maxTime / 10; // reserve time for later adding to backend
plasmaWordIndexEntryContainer container = assortmentCluster.getFromAll(wordHash, maxTime);
if (container == null) {
container = new plasmaWordIndexEntryContainer(wordHash);
}
container.add(backend.getContainer(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : System.currentTimeMillis() - start));
return container;
}
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
// this possibly creates an index file in the back-end
// the index file is opened and returned as entity object
long start = System.currentTimeMillis();
@@ -406,7 +417,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
}
}
long r = maxTime - (System.currentTimeMillis() - start);
return backend.getIndex(wordHash, deleteIfEmpty, (r < 0) ? 0 : r);
return backend.getEntity(wordHash, deleteIfEmpty, (r < 0) ? 0 : r);
}
public long getUpdateTime(String wordHash) {

@@ -181,7 +181,24 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
}
}
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime) {
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
long start = System.currentTimeMillis();
if (plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash).exists()) {
plasmaWordIndexEntity entity = this.getEntity(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime * 9 / 10);
plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash);
plasmaWordIndexEntry entry;
Iterator i = entity.elements(true);
while ((i.hasNext()) && ((maxTime < 0) || (System.currentTimeMillis() < start + maxTime))) {
entry = (plasmaWordIndexEntry) i.next();
container.add(entry);
}
return container;
} else {
return new plasmaWordIndexEntryContainer(wordHash, 0);
}
}
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) {
return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty);
}
@@ -190,7 +207,6 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
if (f.exists()) return f.lastModified(); else return -1;
}
public void deleteIndex(String wordHash) {
plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash);
}
@@ -200,7 +216,7 @@ public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface {
plasmaWordIndexEntity pi = null;
int count = 0;
try {
pi = getIndex(wordHash, true, -1);
pi = getEntity(wordHash, true, -1);
for (int i = 0; i < urlHashes.length; i++)
if (pi.removeEntry(urlHashes[i], deleteComplete)) count++;
int size = pi.size();

@@ -201,33 +201,33 @@ public final class plasmaWordIndexDistribution {
// collect index
String startPointHash = selectTransferStart();
log.logFine("Selected hash " + startPointHash + " as start point for index distribution, distance = " + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash));
Object[] selectResult = selectTransferIndexes(startPointHash, indexCount, this.maxOpenFiles4Distribution);
plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) selectResult[0];
Object[] selectResult = selectTransferContainers(startPointHash, indexCount, this.maxOpenFiles4Distribution);
plasmaWordIndexEntryContainer[] indexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
//Integer openedFiles = (Integer) selectResult[2];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
if ((indexEntities == null) || (indexEntities.length == 0)) {
if ((indexContainers == null) || (indexContainers.length == 0)) {
log.logFine("No index available for index transfer, hash start-point " + startPointHash);
return -1;
}
// count the indexes again, can be smaller as expected
indexCount = 0;
for (int i = 0; i < indexEntities.length; i++) {
indexCount += indexEntities[i].size();
for (int i = 0; i < indexContainers.length; i++) {
indexCount += indexContainers[i].size();
}
if (indexCount < 50) {
log.logFine("Too few (" + indexCount + ") indexes selected for transfer.");
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
return -1; // failed
}
// find start point for DHT-selection
String keyhash = indexEntities[indexEntities.length - 1].wordHash(); // DHT targets must have greater hashes
String keyhash = indexContainers[indexContainers.length - 1].wordHash(); // DHT targets must have greater hashes
// find a list of DHT-peers
yacySeed[] seeds = new yacySeed[peerCount + 10];
int hc0 = 0;
double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[0].wordHash()),
yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexEntities[indexEntities.length - 1].wordHash()));
double ownDistance = Math.min(yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[0].wordHash()),
yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, indexContainers[indexContainers.length - 1].wordHash()));
double maxDistance = Math.min(ownDistance, 0.4);
synchronized (yacyCore.dhtAgent) {
double avdist;
@@ -239,8 +239,8 @@ public final class plasmaWordIndexDistribution {
}
seeds[hc0] = (yacySeed) e.nextElement();
if (seeds[hc0] != null) {
avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[0].wordHash()),
yacyDHTAction.dhtDistance(seeds[hc0].hash, indexEntities[indexEntities.length - 1].wordHash()));
avdist = Math.max(yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[0].wordHash()),
yacyDHTAction.dhtDistance(seeds[hc0].hash, indexContainers[indexContainers.length - 1].wordHash()));
if (avdist < maxDistance) {
log.logInfo("Selected " + ((hc0 < peerCount) ? "primary" : "reserve") + " DHT target peer " + seeds[hc0].getName() + ":" + seeds[hc0].hash + ", distance = " + avdist);
hc0++;
@@ -252,7 +252,7 @@ public final class plasmaWordIndexDistribution {
if (hc0 < peerCount) {
log.logWarning("found not enough (" + hc0 + ") peers for distribution");
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
return -1; // failed
}
@@ -267,9 +267,9 @@ public final class plasmaWordIndexDistribution {
return -1; // interrupted
}
start = System.currentTimeMillis();
error = yacyClient.transferIndex(seeds[i], indexEntities, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
error = yacyClient.transferIndex(seeds[i], indexContainers, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
if (error == null) {
log.logInfo("Index transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
log.logInfo("Index transfer of " + indexCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
+ " seconds successfull (" + (1000 * indexCount / (System.currentTimeMillis() - start + 1)) + " words/s)");
peerNames += ", " + seeds[i].getName();
hc1++;
@@ -286,8 +286,8 @@ public final class plasmaWordIndexDistribution {
// success
if (delete) {
try {
if (deleteTransferIndexes(indexEntities)) {
log.logFine("Deleted all " + indexEntities.length + " transferred whole-word indexes locally");
if (deleteTransferIndexes(indexContainers)) {
log.logFine("Deleted all " + indexContainers.length + " transferred whole-word indexes locally");
return indexCount;
} else {
log.logSevere("Deleted not all transferred whole-word indexes");
@@ -299,13 +299,13 @@ public final class plasmaWordIndexDistribution {
}
} else {
// simply close the indexEntities
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
}
return indexCount;
} else {
log.logSevere("Index distribution failed. Too few peers (" + hc1 + ") received the index, not deleted locally.");
// simply close the indexEntities
closeTransferIndexes (indexEntities);
closeTransferIndexes(indexContainers);
return -1;
}
}
@@ -322,15 +322,16 @@ public final class plasmaWordIndexDistribution {
return startPointHash;
}
Object[] /* of {plasmaWordIndexEntity[], HashMap(String, plasmaCrawlLURL.Entry)}*/
selectTransferIndexes(String hash, int count, int maxOpenFiles) {
Object[] /* of {plasmaWordIndexEntryContainer[], HashMap(String, plasmaCrawlLURL.Entry)}*/
selectTransferContainers(String hash, int count, int maxOpenFiles) {
// the hash is a start hash from where the indexes are picked
ArrayList tmpEntities = new ArrayList(count);
ArrayList tmpContainers = new ArrayList(count);
String nexthash = "";
try {
int currOpenFiles = 0;
Iterator wordHashIterator = this.wordIndex.wordHashes(hash, true, true);
plasmaWordIndexEntity indexEntity, tmpEntity;
plasmaWordIndexEntity indexEntity;
plasmaWordIndexEntryContainer indexContainer;
Iterator urlIter;
Iterator hashIter;
plasmaWordIndexEntry indexEntry;
@@ -343,56 +344,15 @@ public final class plasmaWordIndexDistribution {
(wordHashIterator.hasNext()) &&
((nexthash = (String) wordHashIterator.next()) != null) &&
(nexthash.trim().length() > 0) &&
((currOpenFiles == 0) || (yacyDHTAction.dhtDistance(nexthash,
((plasmaWordIndexEntity)tmpEntities.get(0)).wordHash()) < 0.2))
((currOpenFiles == 0) ||
(yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntity)tmpContainers.get(0)).wordHash()) < 0.2))
) {
indexEntity = this.wordIndex.getEntity(nexthash, true, -1);
if (indexEntity.size() == 0) {
indexEntity.deleteComplete();
} else if ((indexEntity.size() <= count)|| // if we havn't exceeded the limit
(Math.abs(indexEntity.size() - count) <= 10)){ // or there are only at most 10 entries left
// take the whole entity
try {
// fist check if we know all urls
urlIter = indexEntity.elements(true);
unknownURLEntries.clear();
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
try {
lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry);
if ((lurl == null) || (lurl.url() == null)) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
}
} catch (IOException e) {
unknownURLEntries.add(indexEntry.getUrlHash());
}
}
// now delete all entries that have no url entry
hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
String nextUrlHash = (String) hashIter.next();
indexEntity.removeEntry(nextUrlHash, false);
this.urlPool.loadedURL.remove(nextUrlHash);
}
if (indexEntity.size() == 0) {
indexEntity.deleteComplete();
} else {
// use whats remaining
tmpEntities.add(indexEntity);
this.log.logFine("Selected whole index (" + indexEntity.size() + " URLs, " + unknownURLEntries.size() + " not bound) for word " + indexEntity.wordHash());
count -= indexEntity.size();
currOpenFiles++;
}
} catch (kelondroException e) {
this.log.logSevere("plasmaWordIndexDistribution/1: deleted DB for word " + indexEntity.wordHash(), e);
indexEntity.deleteComplete();
}
} else {
// make an on-the-fly entity and insert values
tmpEntity = new plasmaWordIndexEntity(indexEntity.wordHash());
indexContainer = new plasmaWordIndexEntryContainer(indexEntity.wordHash());
try {
urlIter = indexEntity.elements(true);
unknownURLEntries.clear();
@@ -404,7 +364,7 @@ public final class plasmaWordIndexDistribution {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
tmpEntity.addEntry(indexEntry);
indexContainer.add(indexEntry);
count--;
}
} catch (IOException e) {
@@ -426,8 +386,8 @@ public final class plasmaWordIndexDistribution {
}
// use whats remaining
this.log.logFine("Selected partial index (" + tmpEntity.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + tmpEntity.wordHash());
tmpEntities.add(tmpEntity);
this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + indexContainer.wordHash());
tmpContainers.add(indexContainer);
} catch (kelondroException e) {
this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + indexEntity.wordHash(), e);
indexEntity.deleteComplete();
@@ -438,8 +398,8 @@ public final class plasmaWordIndexDistribution {
}
// transfer to array
plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]);
return new Object[]{indexEntities, knownURLs, new Integer(currOpenFiles)};
plasmaWordIndexEntryContainer[] entryContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
return new Object[]{entryContainers, knownURLs, new Integer(currOpenFiles)};
} catch (IOException e) {
this.log.logSevere("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage(), e);
return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)};
@@ -477,6 +437,40 @@ public final class plasmaWordIndexDistribution {
} catch (IOException ee) {}
}
void closeTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) {
for (int i = 0; i < indexContainers.length; i++) {
indexContainers[i] = null;
}
}
boolean deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) throws IOException {
Iterator urlIter;
plasmaWordIndexEntry indexEntry;
plasmaWordIndexEntity indexEntity;
String[] urlHashes;
int sz;
boolean success = true;
for (int i = 0; i < indexContainers.length; i++) {
// delete entries separately
int c = 0;
urlHashes = new String[indexContainers[i].size()];
urlIter = indexContainers[i].entries();
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
urlHashes[c++] = indexEntry.getUrlHash();
}
wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true);
indexEntity = wordIndex.getEntity(indexContainers[i].wordHash(), true, -1);
sz = indexEntity.size();
// indexEntity.close();
closeTransferIndex(indexEntity);
log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + sz + " entries left");
indexContainers[i] = null;
}
return success;
}
/*
boolean deleteTransferIndexes(plasmaWordIndexEntity[] indexEntities) throws IOException {
Iterator urlIter;
plasmaWordIndexEntry indexEntry;
@@ -500,13 +494,6 @@ public final class plasmaWordIndexDistribution {
// indexEntity.close();
closeTransferIndex(indexEntity);
log.logFine("Deleted partial index (" + c + " URLs) for word " + indexEntities[i].wordHash() + "; " + sz + " entries left");
// DEBUG: now try to delete the remaining index. If this works, this routine is fine
/*
if (wordIndex.getEntity(indexEntities[i].wordHash()).deleteComplete())
System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " SUCCESSFULL");
else
System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " FAILED");
*/
// end debug
indexEntities[i].close();
} else {
@@ -516,7 +503,7 @@ public final class plasmaWordIndexDistribution {
} else {
indexEntities[i].close();
// have another try...
if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot() /*PLASMADB*/, indexEntities[i].wordHash()).delete())) {
if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot(), indexEntities[i].wordHash()).delete())) {
success = false;
log.logSevere("Could not delete whole index for word " + indexEntities[i].wordHash());
}
@@ -526,6 +513,7 @@ public final class plasmaWordIndexDistribution {
}
return success;
}
*/
public void startTransferWholeIndex(yacySeed seed, boolean delete) {
if (transferIdxThread == null) {
@@ -573,14 +561,14 @@ public final class plasmaWordIndexDistribution {
// word chunk
private String endPointHash;
private String startPointHash;
plasmaWordIndexEntity[] indexEntities;
plasmaWordIndexEntryContainer[] indexContainers;
// other fields
HashMap urlCache;
public transferIndexWorkerThread(
yacySeed seed,
plasmaWordIndexEntity[] indexEntities,
plasmaWordIndexEntryContainer[] indexContainers,
HashMap urlCache,
boolean gzipBody,
int timeout,
@@ -594,7 +582,7 @@ public final class plasmaWordIndexDistribution {
this.timeout4Transfer = timeout;
this.iteration = iteration;
this.seed = seed;
this.indexEntities = indexEntities;
this.indexContainers = indexContainers;
this.urlCache = urlCache;
this.idxCount = idxCount;
this.chunkSize = chunkSize;
@@ -657,11 +645,11 @@ public final class plasmaWordIndexDistribution {
// transfering seleted words to remote peer
this.status = "Running: Transfering chunk " + iteration;
String error = yacyClient.transferIndex(seed, indexEntities, urlCache, gzipBody4Transfer, timeout4Transfer);
String error = yacyClient.transferIndex(seed, indexContainers, urlCache, gzipBody4Transfer, timeout4Transfer);
if (error == null) {
// words successfully transfered
transferTime = System.currentTimeMillis() - start;
plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "]" +
plasmaWordIndexDistribution.this.log.logInfo("Index transfer of " + idxCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length-1].wordHash() + "]" +
" to peer " + seed.getName() + ":" + seed.hash + " in " + (transferTime/1000) + " seconds successfull (" +
(1000 * idxCount / (transferTime + 1)) + " words/s)");
retryCount = 0;
@@ -817,7 +805,7 @@ public final class plasmaWordIndexDistribution {
}
public void performTransferWholeIndex() {
plasmaWordIndexEntity[] newIndexEntities = null, oldIndexEntities = null;
plasmaWordIndexEntryContainer[] newIndexContainers = null, oldIndexContainers = null;
try {
// pausing the regular index distribution
// TODO: adding sync, to wait for a still running index distribution to finish
@@ -838,12 +826,12 @@ public final class plasmaWordIndexDistribution {
iteration++;
int idxCount = 0;
selectionStart = System.currentTimeMillis();
oldIndexEntities = newIndexEntities;
oldIndexContainers = newIndexContainers;
// selecting 500 words to transfer
this.status = "Running: Selecting chunk " + iteration;
Object[] selectResult = selectTransferIndexes(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
newIndexEntities = (plasmaWordIndexEntity[]) selectResult[0];
Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
newIndexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
openedFiles = (Integer) selectResult[2];
@@ -851,7 +839,7 @@ public final class plasmaWordIndexDistribution {
* a) no words are left in the index
* b) max open file limit was exceeded
*/
if ((newIndexEntities == null) || (newIndexEntities.length == 0)) {
if ((newIndexContainers == null) || (newIndexContainers.length == 0)) {
if (sb.wordIndex.size() > 0) {
// if there are still words in the index we try it again now
startPointHash = "------------";
@@ -863,15 +851,15 @@ public final class plasmaWordIndexDistribution {
}
} else {
// count the indexes again, can be smaller as expected
for (int i = 0; i < newIndexEntities.length; i++) idxCount += newIndexEntities[i].size();
for (int i = 0; i < newIndexContainers.length; i++) idxCount += newIndexContainers[i].size();
// getting start point for next DHT-selection
oldStartingPointHash = startPointHash;
startPointHash = newIndexEntities[newIndexEntities.length - 1].wordHash(); // DHT targets must have greater hashes
startPointHash = newIndexContainers[newIndexContainers.length - 1].wordHash(); // DHT targets must have greater hashes
selectionEnd = System.currentTimeMillis();
selectionTime = selectionEnd - selectionStart;
plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexEntities[0].wordHash() + " .. " + newIndexEntities[newIndexEntities.length-1].wordHash() + "]" +
plasmaWordIndexDistribution.this.log.logInfo("Index selection of " + idxCount + " words [" + newIndexContainers[0].wordHash() + " .. " + newIndexContainers[newIndexContainers.length-1].wordHash() + "]" +
" in " +
(selectionTime / 1000) + " seconds (" +
(1000 * idxCount / (selectionTime+1)) + " words/s)");
@@ -886,10 +874,10 @@ public final class plasmaWordIndexDistribution {
this.status = "Aborted because of Transfer error:\n" + worker.getStatus();
// cleanup. closing all open files
closeEntities(oldIndexEntities);
oldIndexEntities = null;
closeEntities(newIndexEntities);
newIndexEntities = null;
closeContainers(oldIndexContainers);
oldIndexContainers = null;
closeContainers(newIndexContainers);
newIndexContainers = null;
// abort index transfer
return;
@@ -922,10 +910,10 @@ public final class plasmaWordIndexDistribution {
if (delete) {
this.status = "Running: Deleting chunk " + iteration;
try {
if (deleteTransferIndexes(oldIndexEntities)) {
plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexEntities.length + " transferred whole-word indexes locally");
if (deleteTransferIndexes(oldIndexContainers)) {
plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexContainers.length + " transferred whole-word indexes locally");
transferedEntryCount += idxCount;
transferedEntityCount += oldIndexEntities.length;
transferedEntityCount += oldIndexContainers.length;
} else {
plasmaWordIndexDistribution.this.log.logSevere("Deleted not all transferred whole-word indexes");
}
@@ -933,18 +921,18 @@ public final class plasmaWordIndexDistribution {
plasmaWordIndexDistribution.this.log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee);
}
} else {
this.closeEntities(oldIndexEntities);
this.closeContainers(oldIndexContainers);
transferedEntryCount += idxCount;
transferedEntityCount += oldIndexEntities.length;
transferedEntityCount += oldIndexContainers.length;
}
oldIndexEntities = null;
oldIndexContainers = null;
}
this.worker = null;
}
// handover chunk to transfer worker
if (!((newIndexEntities == null) || (newIndexEntities.length == 0))) {
worker = new transferIndexWorkerThread(seed,newIndexEntities,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash);
if (!((newIndexContainers == null) || (newIndexContainers.length == 0))) {
worker = new transferIndexWorkerThread(seed,newIndexContainers,urlCache,gzipBody4Transfer,timeout4Transfer,iteration,idxCount,idxCount,startPointHash,oldStartingPointHash);
worker.start();
}
}
@@ -961,30 +949,21 @@ public final class plasmaWordIndexDistribution {
try {worker.join();}catch(Exception e){}
// worker = null;
}
if (oldIndexEntities != null) closeEntities(oldIndexEntities);
if (newIndexEntities != null) closeEntities(newIndexEntities);
if (oldIndexContainers != null) closeContainers(oldIndexContainers);
if (newIndexContainers != null) closeContainers(newIndexContainers);
plasmaWordIndexDistribution.this.paused = false;
}
}
private void closeEntities(plasmaWordIndexEntity[] indexEntities) {
if ((indexEntities == null)||(indexEntities.length ==0)) return;
private void closeContainers(plasmaWordIndexEntryContainer[] indexContainers) {
if ((indexContainers == null)||(indexContainers.length ==0)) return;
for (int i = 0; i < indexEntities.length; i++) try {
indexEntities[i].close();
} catch (IOException ee) {}
for (int i = 0; i < indexContainers.length; i++) {
indexContainers[i] = null;
}
/*
private boolean isAborted() {
if (finished || Thread.currentThread().isInterrupted()) {
this.status = "aborted";
return true;
}
return false;
}
*/
}
}

@@ -48,7 +48,6 @@ import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Set;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroException;
@@ -111,6 +110,7 @@ public final class plasmaWordIndexEntity {
hash.substring(4,6) + "/" + hash + ".db");
}
/*
public plasmaWordIndexEntity(String wordHash) {
// this creates a nameless temporary index. It is needed for combined search
// and used to hold the intersection of two indexes
@@ -121,7 +121,7 @@ public final class plasmaWordIndexEntity {
theLocation = null;
theTmpMap = new TreeMap();
}
*/
public boolean isTMPEntity() {
return theTmpMap != null;
}
@@ -302,12 +302,6 @@ public final class plasmaWordIndexEntity {
else return "EMPTY";
}
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public void merge(plasmaWordIndexEntity otherEntity, long time) throws IOException {
// this is a merge of another entity to this entity
@ -324,6 +318,14 @@ public final class plasmaWordIndexEntity {
}
}
/*
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException {
// big problem here: there cannot be a time-out for join, since a time-out will leave the joined set too big.
@ -485,5 +487,5 @@ public final class plasmaWordIndexEntity {
}
return conj;
}
*/
}

@ -112,6 +112,9 @@ public final class plasmaWordIndexEntry {
public static final int AP_IMG = 9; // tag inside image references
public static final int AP_TAG = 10; // for tagged indexing (i.e. using mp3 tags)
public static final int AP_ANCHOR = 11; // anchor description
public static final int AP_BOLD = 12;
public static final int AP_ITALICS = 13;
public static final int AP_INVISIBLE = 14; // good for spam detection
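The three new appearance positions extend the AP_* list; how they are packed into an entry is not shown in this hunk. A minimal, hypothetical bit-set sketch (plain Java, all names invented here) of how such positions are typically set and tested:

// Hypothetical illustration only: this commit defines the positions, not the packing.
public class AppearanceFlagsDemo {
    // positions as defined above
    static final int AP_BOLD = 12;
    static final int AP_ITALICS = 13;
    static final int AP_INVISIBLE = 14;

    static int setFlag(int flags, int position) { return flags | (1 << position); }
    static boolean hasFlag(int flags, int position) { return (flags & (1 << position)) != 0; }

    public static void main(String[] args) {
        int flags = 0;
        flags = setFlag(flags, AP_INVISIBLE);
        // a word that appears only invisibly on a page is a spam indicator
        System.out.println("invisible: " + hasFlag(flags, AP_INVISIBLE)); // true
        System.out.println("bold: " + hasFlag(flags, AP_BOLD));           // false
    }
}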
// URL attributes
public static final int UA_LOCAL = 0; // URL was crawled locally
@ -208,6 +211,8 @@ public final class plasmaWordIndexEntry {
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
public plasmaWordIndexEntry(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int hitcount, //*how often this word appears in the text
int wordcount, //*total number of words
int phrasecount, //*total number of phrases
@ -227,14 +232,9 @@ public final class plasmaWordIndexEntry {
// more needed attributes:
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc
// - boolean: URL attributes
// - int: url-length (shorter are better)
// - int: url-number of components / length of path
// - int: length of description tag / title tag (longer is better)
// - int: number of chapters
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - int: length of description
// - int: length of title
// - int: # of keywords
if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk";

@ -54,12 +54,14 @@ package de.anomic.plasma;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
public final class plasmaWordIndexEntryContainer implements Comparable {
private final String wordHash;
private String wordHash;
private final HashMap container; // urlHash/plasmaWordIndexEntry - Mapping
private long updateTime;
@ -73,6 +75,15 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
container = new HashMap(initContainerSize); // a urlhash/plasmaWordIndexEntry - relation
}
public void setWordHash(String newWordHash) {
// this is used to replicate a container for different word indexes during global search
this.wordHash = newWordHash;
}
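A hedged usage sketch for setWordHash: during a global search, one container of results can be merged and then re-labelled for another query word. The word hashes below are made up, and the add(container) overload used is the one shown further down in this file; this compiles only with the YaCy classes on the classpath:

import de.anomic.plasma.plasmaWordIndexEntryContainer;

public class ContainerReplicationSketch {
    public static void main(String[] args) {
        // assumed: a container fetched for the first query word (hash invented)
        plasmaWordIndexEntryContainer remoteResult = new plasmaWordIndexEntryContainer("AAAAAAAAAAAA");
        // replicate it under a second query word for the global search
        plasmaWordIndexEntryContainer copy = new plasmaWordIndexEntryContainer(null);
        copy.add(remoteResult);           // merges all entries (see add(container) below)
        copy.setWordHash("BBBBBBBBBBBB"); // re-label the copy for the other word index
    }
}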
public void clear() {
container.clear();
}
public int size() {
return container.size();
}
@ -85,14 +96,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return wordHash;
}
public int add(plasmaWordIndexEntry entry) {
return add(entry, System.currentTimeMillis());
}
public int add(plasmaWordIndexEntry entry, long updateTime) {
this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
return (add(entry)) ? 1 : 0;
return (addi(entry)) ? 1 : 0;
}
public int add(plasmaWordIndexEntry[] entries, long updateTime) {
int c = 0;
for (int i = 0; i < entries.length; i++) if (add(entries[i])) c++;
for (int i = 0; i < entries.length; i++) if (addi(entries[i])) c++;
this.updateTime = java.lang.Math.max(this.updateTime, updateTime);
return c;
}
@ -102,13 +117,13 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
Iterator i = c.entries();
int x = 0;
while (i.hasNext()) {
if (add((plasmaWordIndexEntry) i.next())) x++;
if (addi((plasmaWordIndexEntry) i.next())) x++;
}
this.updateTime = java.lang.Math.max(this.updateTime, c.updateTime);
return x;
}
private boolean add(plasmaWordIndexEntry entry) {
private boolean addi(plasmaWordIndexEntry entry) {
// returns true if the new entry was added, false if it already existed
return (container.put(entry.getUrlHash(), entry) == null);
}
@ -117,10 +132,18 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return container.containsKey(urlHash);
}
public plasmaWordIndexEntry get(String urlHash) {
return (plasmaWordIndexEntry) container.get(urlHash);
}
public plasmaWordIndexEntry[] getEntryArray() {
return (plasmaWordIndexEntry[]) container.values().toArray(new plasmaWordIndexEntry[container.size()]);
}
public plasmaWordIndexEntry remove(String urlHash) {
return (plasmaWordIndexEntry) container.remove(urlHash);
}
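One caveat on getEntryArray as corrected above: the no-argument Collection.toArray() returns Object[], and casting that result to a typed array fails at runtime; the typed-array overload must be used. A self-contained demonstration of the pitfall:

import java.util.ArrayList;
import java.util.List;

public class ToArrayDemo {
    public static void main(String[] args) {
        List list = new ArrayList();
        list.add("abc");
        // throws ClassCastException: Object[] cannot be cast to String[]
        // String[] wrong = (String[]) list.toArray();
        String[] right = (String[]) list.toArray(new String[list.size()]); // ok
        System.out.println(right[0]);
    }
}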
public Iterator entries() {
// returns an iterator of plasmaWordIndexEntry objects
return container.values().iterator();
@ -146,4 +169,126 @@ public final class plasmaWordIndexEntryContainer implements Comparable {
return (int) kelondroBase64Order.enhancedCoder.decodeLong(this.wordHash.substring(0, 4));
}
public static plasmaWordIndexEntryContainer joinContainer(Set containers, long time, int maxDistance) {
long stamp = System.currentTimeMillis();
// order containers by their size
TreeMap map = new TreeMap();
plasmaWordIndexEntryContainer singleContainer;
Iterator i = containers.iterator();
int count = 0;
while (i.hasNext()) {
// get next container:
singleContainer = (plasmaWordIndexEntryContainer) i.next();
// check result
if ((singleContainer == null) || (singleContainer.size() == 0)) return new plasmaWordIndexEntryContainer(null); // as this is a conjunction of searches, there is no result if any word is unknown
// store result in order of result size; the count term keeps equal-sized containers distinct
map.put(new Long(singleContainer.size() * 1000 + count), singleContainer);
count++;
}
// check if there is any result
if (map.size() == 0) return new plasmaWordIndexEntryContainer(null); // no result, nothing found
// the map now holds the search results in order of number of hits per word
// we now must pairwise build up a conjunction of these sets
Long k = (Long) map.firstKey(); // the smallest, i.e. the container with the fewest entries
plasmaWordIndexEntryContainer searchA, searchB, searchResult = (plasmaWordIndexEntryContainer) map.remove(k);
while ((map.size() > 0) && (searchResult.size() > 0)) {
// take the first element of map which is a result and combine it with result
k = (Long) map.firstKey(); // the next smallest...
time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
searchA = searchResult;
searchB = (plasmaWordIndexEntryContainer) map.remove(k);
searchResult = plasmaWordIndexEntryContainer.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1), maxDistance);
// free resources
searchA = null;
searchB = null;
}
// in 'searchResult' is now the combined search result
if (searchResult.size() == 0) return new plasmaWordIndexEntryContainer(null);
return searchResult;
}
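joinContainer intersects the containers smallest-first, so intermediate results can only shrink. A standalone sketch of the same ordering trick using plain java.util sets instead of the YaCy containers:

import java.util.*;

public class PairwiseJoinDemo {
    public static void main(String[] args) {
        List sets = new ArrayList();
        sets.add(new TreeSet(Arrays.asList(new String[] {"a", "b", "c", "d"})));
        sets.add(new TreeSet(Arrays.asList(new String[] {"b", "c"})));
        sets.add(new TreeSet(Arrays.asList(new String[] {"b", "c", "d"})));
        // order by size; size*1000+counter keeps equal-sized sets distinct,
        // mirroring the Long key built in joinContainer above
        TreeMap bySize = new TreeMap();
        for (int i = 0; i < sets.size(); i++) {
            Set s = (Set) sets.get(i);
            bySize.put(new Long(s.size() * 1000L + i), s);
        }
        Set result = (Set) bySize.remove(bySize.firstKey()); // smallest first
        while (!bySize.isEmpty() && !result.isEmpty()) {
            Set next = (Set) bySize.remove(bySize.firstKey());
            result.retainAll(next); // conjunction
        }
        System.out.println(result); // prints [b, c]
    }
}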
// join methods
private static int log2(int x) {
int l = 0;
while (x > 0) {x = x >> 1; l++;}
return l;
}
public static plasmaWordIndexEntryContainer joinConstructive(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) {
if ((i1 == null) || (i2 == null)) return null;
if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntryContainer(null);
// decide which method to use
int high = ((i1.size() > i2.size()) ? i1.size() : i2.size());
int low = ((i1.size() > i2.size()) ? i2.size() : i1.size());
int stepsEnum = 10 * (high + low - 1);
int stepsTest = 12 * log2(high) * low;
// start most efficient method
if (stepsEnum > stepsTest) {
if (i1.size() < i2.size())
return joinConstructiveByTest(i1, i2, time, maxDistance);
else
return joinConstructiveByTest(i2, i1, time, maxDistance);
} else {
return joinConstructiveByEnumeration(i1, i2, time, maxDistance);
}
}
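The step estimates decide between the two join strategies. With high = 10000 and low = 10, enumeration costs 10 * (10000 + 10 - 1) = 100090 steps while hash probing costs 12 * log2(10000) * 10 = 12 * 14 * 10 = 1680, so probing the large container with the small one wins; with high = low = 1000, enumeration costs 10 * 1999 = 19990 against 12 * 10 * 1000 = 120000, so enumeration wins. A small standalone check of the formulas as written above:

public class JoinCostDemo {
    // same bit-length log2 as in plasmaWordIndexEntryContainer
    static int log2(int x) { int l = 0; while (x > 0) { x = x >> 1; l++; } return l; }

    static String pick(int high, int low) {
        int stepsEnum = 10 * (high + low - 1);
        int stepsTest = 12 * log2(high) * low;
        return (stepsEnum > stepsTest) ? "test" : "enumeration";
    }

    public static void main(String[] args) {
        System.out.println(pick(10000, 10));  // test        (100090 > 1680)
        System.out.println(pick(1000, 1000)); // enumeration (19990 < 120000)
    }
}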
private static plasmaWordIndexEntryContainer joinConstructiveByTest(plasmaWordIndexEntryContainer small, plasmaWordIndexEntryContainer large, long time, int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY TEST");
plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result
Iterator se = small.entries();
plasmaWordIndexEntry ie0, ie1;
long stamp = System.currentTimeMillis();
while ((se.hasNext()) && ((System.currentTimeMillis() - stamp) < time)) {
ie0 = (plasmaWordIndexEntry) se.next();
ie1 = large.get(ie0.getUrlHash());
if (ie1 != null) {
// this is a hit. Calculate word distance:
ie0.combineDistance(ie1);
if (ie0.worddistance() <= maxDistance) conj.add(ie0);
}
}
return conj;
}
private static plasmaWordIndexEntryContainer joinConstructiveByEnumeration(plasmaWordIndexEntryContainer i1, plasmaWordIndexEntryContainer i2, long time, int maxDistance) {
System.out.println("DEBUG: JOIN METHOD BY ENUMERATION");
plasmaWordIndexEntryContainer conj = new plasmaWordIndexEntryContainer(null); // start with empty search result
Iterator e1 = i1.entries();
Iterator e2 = i2.entries();
int c;
if ((e1.hasNext()) && (e2.hasNext())) {
plasmaWordIndexEntry ie1;
plasmaWordIndexEntry ie2;
ie1 = (plasmaWordIndexEntry) e1.next();
ie2 = (plasmaWordIndexEntry) e2.next();
long stamp = System.currentTimeMillis();
while ((System.currentTimeMillis() - stamp) < time) {
c = ie1.getUrlHash().compareTo(ie2.getUrlHash());
if (c < 0) {
if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
} else if (c > 0) {
if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
} else {
// we have found the same URLs in different searches!
ie1.combineDistance(ie2);
if (ie1.worddistance() <= maxDistance) conj.add(ie1);
if (e1.hasNext()) ie1 = (plasmaWordIndexEntry) e1.next(); else break;
if (e2.hasNext()) ie2 = (plasmaWordIndexEntry) e2.next(); else break;
}
}
}
return conj;
}
}
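The enumeration join is a textbook sorted merge: both cursors advance in key order and a match is emitted on equal keys. Note it only yields a complete result if entries() delivers URL hashes in sorted order, which a HashMap-backed container does not guarantee by itself; treat that ordering as an assumption here. A self-contained merge over two sorted key sets:

import java.util.Iterator;
import java.util.TreeSet;

public class MergeJoinDemo {
    public static void main(String[] args) {
        TreeSet a = new TreeSet(); a.add("h1"); a.add("h3"); a.add("h5");
        TreeSet b = new TreeSet(); b.add("h2"); b.add("h3"); b.add("h5");
        Iterator e1 = a.iterator(), e2 = b.iterator();
        String k1 = (String) e1.next(), k2 = (String) e2.next();
        while (true) {
            int c = k1.compareTo(k2);
            if (c < 0)      { if (e1.hasNext()) k1 = (String) e1.next(); else break; }
            else if (c > 0) { if (e2.hasNext()) k2 = (String) e2.next(); else break; }
            else {
                System.out.println("match: " + k1); // both sides hold this key
                if (e1.hasNext()) k1 = (String) e1.next(); else break;
                if (e2.hasNext()) k2 = (String) e2.next(); else break;
            }
        }
        // prints: match: h3, match: h5
    }
}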

@ -50,7 +50,8 @@ public interface plasmaWordIndexInterface {
public Iterator wordHashes(String startWordHash, boolean up);
public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty, long maxTime);
public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime);
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime);
public long getUpdateTime(String wordHash);
public void deleteIndex(String wordHash);

@ -50,12 +50,13 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.plasma.plasmaURLPattern;
@ -349,10 +350,11 @@ public final class yacyClient {
public static int search(
String wordhashes,
int maxDistance,
boolean global,
yacySeed targetPeer,
plasmaCrawlLURL urlManager,
plasmaWordIndexEntity entityCache,
plasmaWordIndexEntryContainer containerCache,
plasmaURLPattern blacklist,
plasmaSnippetCache snippets,
plasmaSearchProfile profile
@ -403,6 +405,7 @@ public final class yacyClient {
obj.put("ttl", "0");
obj.put("duetime", Long.toString(duetime));
obj.put("profile", profile.targetToString()); // new duetimes splitted by specific search tasks
obj.put("maxdist", maxDistance);
obj.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date()));
//yacyCore.log.logDebug("yacyClient.search url=" + url);
@ -460,6 +463,9 @@ public final class yacyClient {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist
int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
final plasmaWordIndexEntry entry;
@ -467,6 +473,7 @@ public final class yacyClient {
// the old way to define words
entry = new plasmaWordIndexEntry(
urlEntry.hash(),
urlLength, urlComps,
urlEntry.wordCount(),
0, 0, 0, 0, 0, 0,
urlEntry.size(),
@ -494,7 +501,7 @@ public final class yacyClient {
}
// finally insert the containers to the index
for (int m = 0; m < words; m++) { entityCache.addEntries(container[m]); }
for (int m = 0; m < words; m++) { containerCache.add(container[m]); }
// generate statistics
long searchtime;
@ -841,7 +848,7 @@ public final class yacyClient {
httpHeader requestHeader) throws IOException {
*/
public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
HashMap in = transferRWI(targetSeed, indexes, gzipBody, timeout);
if (in == null) { return "no_connection_1"; }
@ -875,7 +882,7 @@ public final class yacyClient {
return null;
}
private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, boolean gzipBody, int timeout) {
private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, boolean gzipBody, int timeout) {
final String address = targetSeed.getAddress();
if (address == null) { return null; }
@ -903,7 +910,7 @@ public final class yacyClient {
Iterator eenum;
plasmaWordIndexEntry entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].elements(true);
eenum = indexes[i].entries();
while (eenum.hasNext()) {
entry = (plasmaWordIndexEntry) eenum.next();
entrypost.append(indexes[i].wordHash())

@ -52,8 +52,8 @@ import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaURLPattern;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaSearchProfile;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.logging.serverLog;
public class yacySearch extends Thread {
@ -61,29 +61,31 @@ public class yacySearch extends Thread {
final private Set wordhashes;
final private boolean global;
final private plasmaCrawlLURL urlManager;
final private plasmaWordIndexEntity entityCache;
final private plasmaWordIndexEntryContainer containerCache;
final private plasmaURLPattern blacklist;
final private plasmaSnippetCache snippetCache;
final private yacySeed targetPeer;
private int links;
private int maxDistance;
final private plasmaSearchProfile profile;
public yacySearch(Set wordhashes, boolean global, yacySeed targetPeer,
plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
public yacySearch(Set wordhashes, int maxDistance, boolean global, yacySeed targetPeer,
plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
super("yacySearch_" + targetPeer.getName());
this.wordhashes = wordhashes;
this.global = global;
this.urlManager = urlManager;
this.entityCache = entityCache;
this.containerCache = containerCache;
this.blacklist = blacklist;
this.snippetCache = snippetCache;
this.targetPeer = targetPeer;
this.links = -1;
this.maxDistance = maxDistance;
this.profile = (plasmaSearchProfile) profile.clone();
}
public void run() {
this.links = yacyClient.search(set2string(wordhashes), global, targetPeer, urlManager, entityCache, blacklist, snippetCache, profile);
this.links = yacyClient.search(set2string(wordhashes), maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, profile);
if (links != 0) {
//yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes);
yacyCore.seedDB.mySeed.incRI(links);
@ -172,7 +174,7 @@ public class yacySearch extends Thread {
return result;
}
public static yacySearch[] searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaWordIndexEntity entityCache,
public static yacySearch[] searchHashes(Set wordhashes, int maxDist, plasmaCrawlLURL urlManager, plasmaWordIndexEntryContainer containerCache,
int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, plasmaSearchProfile profile) {
// check own peer status
if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; }
@ -185,8 +187,8 @@ public class yacySearch extends Thread {
if (targets == 0) return null;
yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) {
searchThreads[i]= new yacySearch(wordhashes, true, targetPeers[i],
urlManager, entityCache, blacklist, snippetCache, profile);
searchThreads[i]= new yacySearch(wordhashes, maxDist, true, targetPeers[i],
urlManager, containerCache, blacklist, snippetCache, profile);
searchThreads[i].start();
try {Thread.sleep(20);} catch (InterruptedException e) {}
@ -216,5 +218,4 @@ public class yacySearch extends Thread {
}
}
}
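searchHashes fans out one yacySearch thread per target peer, staggered by 20 ms, and collects the results afterwards. A generic, hedged sketch of that fan-out/join pattern with plain threads (the per-peer search logic is stubbed out):

public class FanOutDemo {
    public static void main(String[] args) throws InterruptedException {
        int targets = 3;
        Thread[] workers = new Thread[targets];
        for (int i = 0; i < targets; i++) {
            final int peer = i;
            workers[i] = new Thread(new Runnable() {
                public void run() {
                    // a remote search request would run here
                    System.out.println("asked peer " + peer);
                }
            });
            workers[i].start();
            Thread.sleep(20); // stagger the requests, as searchHashes does
        }
        for (int i = 0; i < targets; i++) workers[i].join(); // collect results
    }
}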
