- refactoring of indexControlRWIs: moved statics to own class; better Dublin Core naming

- fix for http://forum.yacy-websuche.de/viewtopic.php?f=5&t=759&hilit=&p=4866#p4866
- some bugfixes in the remove method of EcoTable
- switched more tables to Eco: crawl Profiles, htcache, seeddb, newsdb

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4397 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 2dc994515d
commit 15397298dc

@ -40,10 +40,10 @@
<td colspan="1">document type</td>
<tr class="TableCellDark">
<td style="background-color:#FFFFFF">&nbsp;</td>
<td>reference</td>
<td>description</td>
<td>author</td>
<td>tags</td>
<td>title</td>
<td>creator</td>
<td>subject</td>
<td>url</td>
<td>emphasized</td>
<td>image</td>
@ -55,10 +55,10 @@
<tr class="TableCellDark">
<td style="background-color:#FFFFFF">&nbsp;</td>
<td>#[allurl]#</td>
<td>#[reference]#</td>
<td>#[description]#</td>
<td>#[author]#</td>
<td>#[tag]#</td>
<td>#[title]#</td>
<td>#[creator]#</td>
<td>#[subject]#</td>
<td>#[url]#</td>
<td>#[emphasized]#</td>
<td>#[image]#</td>
@ -70,10 +70,10 @@
<tr class="TableCellLight">
<td class="TableCellDark">Selection</td>
<td><input type="checkbox" name="allurl" id="allurl" checked="checked" /></td>
<td><input type="checkbox" name="reference" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="description" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="author" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="tag" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="title" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="creator" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="subject" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="url" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="emphasized" onclick="document.selection.allurl.checked=false" /></td>
<td><input type="checkbox" name="image" onclick="document.selection.allurl.checked=false" /></td>

@ -29,7 +29,6 @@ import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@ -43,13 +42,11 @@ import de.anomic.index.indexRWIRowEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchAPI;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.abstractURLPattern;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient;
@ -92,7 +89,7 @@ public class IndexControlRWIs_p {
if (post.containsKey("keystringsearch")) {
keyhash = plasmaCondenser.word2hash(keystring);
prop.put("keyhash", keyhash);
final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, null, sortorder, false);
final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, null, sortorder, false);
if (ranking.filteredCount() == 0) {
prop.put("searchresult", 1);
prop.put("searchresult_word", keystring);
@ -103,7 +100,7 @@ public class IndexControlRWIs_p {
if (keystring.length() == 0 || !plasmaCondenser.word2hash(keystring).equals(keyhash)) {
prop.put("keystring", "&lt;not possible to compute word from hash&gt;");
}
final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, null, sortorder, false);
final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, null, sortorder, false);
if (ranking.filteredCount() == 0) {
prop.put("searchresult", 2);
prop.put("searchresult_wordhash", keyhash);
@ -160,10 +157,10 @@ public class IndexControlRWIs_p {
if (keystring.length() == 0 || !plasmaCondenser.word2hash(keystring).equals(keyhash)) {
prop.put("keystring", "&lt;not possible to compute word from hash&gt;");
}
kelondroBitfield flags = compileFlags(post);
kelondroBitfield flags = plasmaSearchAPI.compileFlags(post);
int count = (post.get("lines", "all").equals("all")) ? -1 : post.getInt("lines", -1);
final plasmaSearchRankingProcess ranking = genSearchresult(prop, sb, keyhash, flags, sortorder, true);
genURLList(prop, keyhash, keystring, ranking, flags, count, sortorder);
final plasmaSearchRankingProcess ranking = plasmaSearchAPI.genSearchresult(prop, sb, keyhash, flags, sortorder, true);
plasmaSearchAPI.genURLList(prop, keyhash, keystring, ranking, flags, count, sortorder);
}
// transfer to other peer
@ -307,7 +304,7 @@ public class IndexControlRWIs_p {
sb.wordIndex.removeEntries(keyhash, urlHashes);
}
if (prop.getInt("searchresult", 0) == 3) listHosts(prop, keyhash);
if (prop.getInt("searchresult", 0) == 3) plasmaSearchAPI.listHosts(prop, keyhash);
}
@ -317,161 +314,4 @@ public class IndexControlRWIs_p {
return prop;
}
/**
 * Translates the flag selection of a posted form into a bitfield filter.
 *
 * @param post the posted form values
 * @return {@code null} if "allurl" is "on" or if "flags" was posted as an empty string;
 *         the bitfield built from the posted "flags" string if that is present and non-empty;
 *         otherwise a bitfield with one flag set for every checkbox posted as "on"
 */
private static kelondroBitfield compileFlags(serverObjects post) {
    final kelondroBitfield selection = new kelondroBitfield(4);

    // "allurl" switches filtering off completely
    if (post.get("allurl", "").equals("on")) {
        return null;
    }

    // an explicitly posted flag string wins over the individual checkboxes
    if (post.get("flags") != null) {
        return (post.get("flags","").length() == 0)
                ? null
                : new kelondroBitfield(4, (String) post.get("flags"));
    }

    // checkbox name -> flag position; both arrays are index-aligned.
    // Note that "reference" selects the dc_description flag and
    // "description" selects the dc_title flag.
    final String[] checkboxes = {
        "reference", "description", "author", "tag", "url", "emphasized",
        "image", "audio", "video", "app", "indexof"
    };
    final int[] flagPositions = {
        indexRWIEntry.flag_app_dc_description,
        indexRWIEntry.flag_app_dc_title,
        indexRWIEntry.flag_app_dc_creator,
        indexRWIEntry.flag_app_dc_subject,
        indexRWIEntry.flag_app_dc_identifier,
        indexRWIEntry.flag_app_emphasized,
        plasmaCondenser.flag_cat_hasimage,
        plasmaCondenser.flag_cat_hasaudio,
        plasmaCondenser.flag_cat_hasvideo,
        plasmaCondenser.flag_cat_hasapp,
        plasmaCondenser.flag_cat_indexof
    };
    for (int n = 0; n < checkboxes.length; n++) {
        if (post.get(checkboxes[n], "").equals("on")) {
            selection.set(flagPositions[n], true);
        }
    }
    return selection;
}
/**
 * Writes the list of seeds returned by the DHT agent for the given hash into the
 * "searchresult_hosts" properties.
 *
 * @param prop      the property object that is filled
 * @param startHash the hash handed to {@code getAcceptRemoteIndexSeeds}; also stored as "searchresult_keyhash"
 */
private static void listHosts(serverObjects prop, String startHash) {
    prop.put("searchresult_keyhash", startHash);

    // no seed database or no connected peers: report an empty host list
    if (yacyCore.seedDB == null || yacyCore.seedDB.sizeConnected() <= 0) {
        prop.put("searchresult_hosts", "0");
        return;
    }

    int hostCount = 0;
    final Iterator<yacySeed> seeds = yacyCore.dhtAgent.getAcceptRemoteIndexSeeds(startHash);
    while (seeds.hasNext()) {
        final yacySeed peer = seeds.next();
        if (peer == null) {
            continue; // null seeds are not listed and do not consume an index
        }
        final String keyPrefix = "searchresult_hosts_" + hostCount;
        prop.put(keyPrefix + "_hosthash", peer.hash);
        prop.putHTML(keyPrefix + "_hostname", peer.hash + " " + peer.get(yacySeed.NAME, "nameless"));
        hostCount++;
    }
    prop.put("searchresult_hosts", hostCount);
}
/**
 * Runs a ranking process for a single word hash and writes the hit statistics
 * into the "searchresult" properties.
 *
 * @param prop      the property object that is filled
 * @param sb        the switchboard providing the ranking profile and the word index
 * @param keyhash   the word hash to search for
 * @param filter    flag filter handed to the query; callers in this file also pass {@code null}
 * @param sortorder sort order handed to the ranking process
 * @param fetchURLs handed through to {@code execQuery}
 * @return the executed ranking process
 */
private static plasmaSearchRankingProcess genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, int sortorder, boolean fetchURLs) {
    final plasmaSearchQuery wordQuery = new plasmaSearchQuery(keyhash, -1, sb.getRanking(), filter);
    final plasmaSearchRankingProcess process = new plasmaSearchRankingProcess(sb.wordIndex, wordQuery, sortorder, Integer.MAX_VALUE);
    process.execQuery(fetchURLs);

    // nothing passed the filter: report case 2 together with the hash that was searched
    if (process.filteredCount() == 0) {
        prop.put("searchresult", 2);
        prop.put("searchresult_wordhash", keyhash);
        return process;
    }

    // case 3: total hit count followed by the per-flag counters
    prop.put("searchresult", 3);
    prop.put("searchresult_allurl", process.filteredCount());
    // Dublin Core related flags ("reference" shows the dc_description counter,
    // "description" shows the dc_title counter)
    prop.put("searchresult_reference", process.flagCount()[indexRWIEntry.flag_app_dc_description]);
    prop.put("searchresult_description", process.flagCount()[indexRWIEntry.flag_app_dc_title]);
    prop.put("searchresult_author", process.flagCount()[indexRWIEntry.flag_app_dc_creator]);
    prop.put("searchresult_tag", process.flagCount()[indexRWIEntry.flag_app_dc_subject]);
    prop.put("searchresult_url", process.flagCount()[indexRWIEntry.flag_app_dc_identifier]);
    prop.put("searchresult_emphasized", process.flagCount()[indexRWIEntry.flag_app_emphasized]);
    // content category flags
    prop.put("searchresult_image", process.flagCount()[plasmaCondenser.flag_cat_hasimage]);
    prop.put("searchresult_audio", process.flagCount()[plasmaCondenser.flag_cat_hasaudio]);
    prop.put("searchresult_video", process.flagCount()[plasmaCondenser.flag_cat_hasvideo]);
    prop.put("searchresult_app", process.flagCount()[plasmaCondenser.flag_cat_hasapp]);
    prop.put("searchresult_indexof", process.flagCount()[plasmaCondenser.flag_cat_indexof]);
    return process;
}
/**
 * Fills the "genUrlList" properties with one entry per URL delivered by the
 * ranking process, followed by one entry per URL hash reported by
 * {@code ranked.miss()}.
 *
 * @param prop      the property object that is filled
 * @param keyhash   the word hash the list belongs to
 * @param keystring the word as entered by the user; written HTML-escaped
 * @param ranked    the ranking process that delivers the entries
 * @param flags     the flag filter in use; {@code null} is exported as an empty string
 * @param maxlines  maximum number of ranked entries to list; a negative value disables the limit
 * @param ordering  written unchanged to "genUrlList_ordering"
 */
private static void genURLList(serverObjects prop, String keyhash, String keystring, plasmaSearchRankingProcess ranked, kelondroBitfield flags, int maxlines, int ordering) {
// search for a word hash and generate a list of url links
prop.put("genUrlList_keyHash", keyhash);
if (ranked.filteredCount() == 0) {
// no hits: template case 1 with a zero count
prop.put("genUrlList", 1);
prop.put("genUrlList_count", 0);
prop.put("searchresult", 2);
} else {
// hits available: template case 2
prop.put("genUrlList", 2);
prop.put("searchresult", 3);
prop.put("genUrlList_flags", (flags == null) ? "" : flags.exportB64());
prop.put("genUrlList_lines", maxlines);
prop.put("genUrlList_ordering", ordering);
// i counts the list entries and is shared by both loops below
int i = 0;
yacyURL url;
indexURLEntry entry;
String us;
// rn holds the ranking of the first listed entry; -1 means "not yet set"
long rn = -1;
while ((ranked.size() > 0) && ((entry = ranked.bestURL(false)) != null)) {
// entries without components or without a url are skipped and do not consume an index
if ((entry == null) || (entry.comp() == null)) continue;
url = entry.comp().url();
if (url == null) continue;
us = url.toNormalform(false, false);
if (rn == -1) rn = entry.ranking();
prop.put("genUrlList_urlList_"+i+"_urlExists", "1");
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().urlHash());
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring);
prop.put("genUrlList_urlList_"+i+"_urlExists_keyHash", keyhash);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlString", us);
// short form: more than 40 chars -> two rows of 20 chars plus "...";
// 31 to 40 chars -> broken after 20 chars; otherwise unchanged.
// NOTE(review): written with put, not putHTML, presumably because of the embedded <br> - confirm that 'us' needs no escaping here
prop.put("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "<br>" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "<br>" + us.substring(20)) : us));
// the ranking is shown relative to the first listed entry
prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", yacyURL.domLengthEstimation(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", plasmaSearchRankingProcess.ybr(entry.hash()));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", ranked.getOrder().authority(entry.hash()));
prop.put("genUrlList_urlList_"+i+"_urlExists_date", serverDate.formatShortDay(new Date(entry.word().lastModified())));
prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_phrasesintext", entry.word().phrasesintext());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_llocal", entry.word().llocal());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_lother", entry.word().lother());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_hitcount", entry.word().hitcount());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_worddistance", entry.word().worddistance());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_pos", entry.word().posintext());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_posinphrase", entry.word().posinphrase());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.word().urlcomps());
prop.putNum("genUrlList_urlList_"+i+"_urlExists_urllength", entry.word().urllength());
// human-readable summary of the flags that are set for this entry
prop.put("genUrlList_urlList_"+i+"_urlExists_props",
((entry.word().flags().get(plasmaCondenser.flag_cat_indexof)) ? "appears on index page, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") +
((entry.word().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_dc_identifier)) ? "appears in url, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_dc_title)) ? "appears in description, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_dc_creator)) ? "appears in author, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_dc_subject)) ? "appears in tags, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_dc_description)) ? "appears in reference, " : "") +
((entry.word().flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized, " : "") +
((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "")
);
// mark urls that are listed in the DHT blacklist
if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, url)) {
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxChecked", "1");
}
i++;
// a negative maxlines means: no limit
if ((maxlines >= 0) && (i >= maxlines)) break;
}
// append the url hashes reported by ranked.miss(); they get urlExists = 0 and only count and hash value
Iterator<String> iter = ranked.miss(); // iterates url hash strings
while (iter.hasNext()) {
us = (String) iter.next();
prop.put("genUrlList_urlList_"+i+"_urlExists", "0");
prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i);
prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", us);
i++;
}
// i now covers ranked entries plus missed hashes
prop.put("genUrlList_urlList", i);
prop.putHTML("genUrlList_keyString", keystring);
prop.put("genUrlList_count", i);
putBlacklists(prop, listManager.getDirListing(listManager.listsPath));
}
}
/**
 * Writes the given list names into the "genUrlList_blacklists" properties.
 *
 * @param prop  the property object that is filled
 * @param lists the list names; the array length is written as the entry count
 */
private static void putBlacklists(serverObjects prop, String[] lists) {
    final int listCount = lists.length;
    prop.put("genUrlList_blacklists", listCount);

    int position = 0;
    for (String listName : lists) {
        prop.put("genUrlList_blacklists_" + position + "_name", listName);
        position++;
    }
}
}

@ -417,11 +417,8 @@ public final class httpc {
// if we reached this point, we should have a connection
} catch (UnknownHostException e) {
if (this.socket != null) {
// no need to track this, the socket cannot be established
synchronized (activeConnections) {activeConnections.remove(this);}
}
this.socket = null;
serverLog.logFine("HTTPC", "Couldn't find host " + server);
close();
throw new IOException("unknown host: " + server);
} catch (IOException e) {
// There was an error while connecting the socket, probably a SocketTimeoutException

@ -105,7 +105,7 @@ public class kelondroDyn {
fbi = new kelondroEcoTable(file, rowdef, kelondroEcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0);
}
}
this.index = (useObjectCache) ? (kelondroIndex) new kelondroCache(fbi) : fbi;
this.index = ((useObjectCache) && (!(fbi instanceof kelondroEcoTable))) ? (kelondroIndex) new kelondroCache(fbi) : fbi;
this.keylen = key;
this.reclen = nodesize;
this.fillChar = fillChar;
@ -345,7 +345,7 @@ public class kelondroDyn {
int recpos = 0;
byte[] k;
while (index.get(k = dynKey(key, recpos)) != null) {
index.remove(k, true);
index.remove(k, false);
buffer.remove(k);
recpos++;
}
@ -520,4 +520,4 @@ public class kelondroDyn {
return -1;
}
}
}
}

@ -119,6 +119,7 @@ public class kelondroEcoTable implements kelondroIndex {
byte[] record = new byte[rowdef.objectsize];
byte[] key = new byte[rowdef.primaryKeyLength];
int fs = (int) file.size();
System.out.print("*** initializing RAM index for EcoTable " + tablefile + ":");
for (int i = 0; i < fs; i++) {
// read entry
file.get(i, record, 0);
@ -129,9 +130,17 @@ public class kelondroEcoTable implements kelondroIndex {
// write the tail into the table
if (table != null) table.addUnique(taildef.newEntry(record, rowdef.primaryKeyLength, true));
if ((i % 10000) == 0) {
System.out.print('.');
System.out.flush();
}
}
System.out.print(" -ordering- ..");
System.out.flush();
// check consistency
ArrayList<Integer[]> doubles = index.removeDoubles();
System.out.println(" -removed " + doubles.size() + " doubles- done.");
if (doubles.size() > 0) {
System.out.println("DEBUG " + tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles");
// from all the doubles take one, put it back to the index and remove the others from the file
@ -392,51 +401,57 @@ public class kelondroEcoTable implements kelondroIndex {
assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size();
assert ((table == null) || (table.size() == index.size()));
assert keepOrder == false; // this class cannot keep the order during a remove
assert key.length == rowdef.primaryKeyLength;
int i = index.geti(key);
if (i == -1) return null; // nothing to do
// prepare result
byte[] b = new byte[rowdef.objectsize];
byte[] p = new byte[rowdef.objectsize];
int sb = index.size();
if (table == null) {
index.removei(key);
file.get(i, b, 0);
file.cleanLast(p, 0);
file.put(i, p, 0);
byte[] k = new byte[rowdef.primaryKeyLength];
System.arraycopy(p, 0, k, 0, rowdef.primaryKeyLength);
index.puti(k, i);
if (i == index.size() - 1) {
index.removei(key);
file.clean(i, b, 0);
} else {
index.removei(key);
file.get(i, b, 0);
file.cleanLast(p, 0);
file.put(i, p, 0);
byte[] k = new byte[rowdef.primaryKeyLength];
System.arraycopy(p, 0, k, 0, rowdef.primaryKeyLength);
index.puti(k, i);
}
assert (file.size() == index.size());
assert ((table == null) || (table.size() == index.size()));
} else {
// get result value from the table copy, so we don't need to read it from the file
kelondroRow.Entry v = table.get(i);
assert key.length == rowdef.primaryKeyLength;
System.arraycopy(key, 0, b, 0, key.length);
System.arraycopy(v.bytes(), 0, b, rowdef.primaryKeyLength, taildef.objectsize);
if (i == index.size() - 1) {
// special handling if the entry is the last entry in the file
index.removei(key);
table.removeRow(i, false);
file.clean(i);
assert (file.size() == index.size());
assert ((table == null) || (table.size() == index.size()));
} else {
// switch values
index.removei(key);
kelondroRow.Entry te = table.removeOne();
table.set(i, te);
file.cleanLast(p, 0);
file.put(i, p, 0);
kelondroRow.Entry lr = rowdef.newEntry(p);
index.removei(key);
index.puti(lr.getPrimaryKeyBytes(), i);
assert (file.size() == index.size());
assert ((table == null) || (table.size() == index.size())) : "table.size() = " + table.size() + ", index.size() = " + index.size();
}
assert (file.size() == index.size());
assert (table.size() == index.size()) : "table.size() = " + table.size() + ", index.size() = " + index.size();
}
assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size();
assert ((table == null) || (table.size() == index.size()));
assert index.size() + 1 == sb : "index.size() = " + index.size() + ", sb = " + sb;
return rowdef.newEntry(b);
}
@ -448,7 +463,7 @@ public class kelondroEcoTable implements kelondroIndex {
kelondroRow.Entry lr = rowdef.newEntry(le);
int i = index.removei(lr.getPrimaryKeyBytes());
assert i >= 0;
table.removeRow(i, false);
if (table != null) table.removeOne();
assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size();
return lr;
}

@ -30,12 +30,14 @@ public class kelondroRotateIterator<E> implements kelondroCloneableIterator<E> {
kelondroCloneableIterator<E> a, clone;
Object modifier;
boolean nempty;
public kelondroRotateIterator(kelondroCloneableIterator<E> a, Object modifier) {
// this works currently only for String-type key iterations
this.a = a;
this.modifier = modifier;
this.clone = (kelondroCloneableIterator<E>) a.clone(modifier);
this.nempty = this.clone.hasNext();
}
public kelondroRotateIterator<E> clone(Object modifier) {
@ -43,7 +45,7 @@ public class kelondroRotateIterator<E> implements kelondroCloneableIterator<E> {
}
public boolean hasNext() {
return true;
return this.nempty;
}
public E next() {
@ -52,6 +54,7 @@ public class kelondroRotateIterator<E> implements kelondroCloneableIterator<E> {
// from the hasNext() method
if (!(a.hasNext())) {
a = (kelondroCloneableIterator<E>) clone.clone(modifier);
assert a.hasNext();
}
return a.next();
}

@ -15,7 +15,7 @@ import de.anomic.plasma.plasmaSwitchboard;
public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImporter {
private File plasmaPath = null;
private HashSet importProfileHandleCache = new HashSet();
private HashSet<String> importProfileHandleCache = new HashSet<String>();
private plasmaCrawlProfile importProfileDB;
private plasmaCrawlNURL importNurlDB;
private int importStartSize;
@ -129,8 +129,8 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack.");
}
// getting an interator and loop through the URL entries
Iterator entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null;
// getting an iterator and loop through the URL entries
Iterator<plasmaCrawlEntry> entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null;
while (true) {
String nextHash = null;
@ -147,7 +147,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
if (!entryIter.hasNext()) break;
this.urlCount++;
nextEntry = (plasmaCrawlEntry) entryIter.next();
nextEntry = entryIter.next();
nextHash = nextEntry.url().hash();
}
} catch (IOException e) {

@ -70,7 +70,7 @@ public class plasmaCrawlProfile {
this.profileTableFile = file;
this.preloadTime = preloadTime;
profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true);
kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, false, false, true);
profileTable = new kelondroMapObjects(dyn, 500);
}
@ -79,7 +79,7 @@ public class plasmaCrawlProfile {
if (profileTable != null) profileTable.close();
if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database");
profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true);
kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, false, false, true);
profileTable = new kelondroMapObjects(dyn, 500);
}

@ -72,7 +72,7 @@ public class plasmaCrawlRobotsTxt {
this.robotsTableFile = robotsTableFile;
this.preloadTime = preloadTime;
robotsTableFile.getParentFile().mkdirs();
robotsTable = new kelondroMapObjects(new kelondroDyn(robotsTableFile, true, true, preloadTime, 256, 512, '_', kelondroNaturalOrder.naturalOrder, true, false, true), 100);
robotsTable = new kelondroMapObjects(new kelondroDyn(robotsTableFile, true, true, preloadTime, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true), 100);
}
private void resetDatabase() {
@ -80,7 +80,7 @@ public class plasmaCrawlRobotsTxt {
if (robotsTable != null) robotsTable.close();
if (!(robotsTableFile.delete())) throw new RuntimeException("cannot delete robots.txt database");
robotsTableFile.getParentFile().mkdirs();
robotsTable = new kelondroMapObjects(new kelondroDyn(robotsTableFile, true, true, preloadTime, 256, 512, '_', kelondroNaturalOrder.naturalOrder, true, false, true), 100);
robotsTable = new kelondroMapObjects(new kelondroDyn(robotsTableFile, true, true, preloadTime, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true), 100);
}
public void close() {
@ -103,7 +103,7 @@ public class plasmaCrawlRobotsTxt {
public Entry getEntry(String hostName) {
try {
Map record = this.robotsTable.getMap(hostName);
Map<String, String> record = this.robotsTable.getMap(hostName);
if (record == null) return null;
return new Entry(hostName, record);
} catch (kelondroException e) {
@ -114,14 +114,16 @@ public class plasmaCrawlRobotsTxt {
public Entry addEntry(
String hostName,
ArrayList disallowPathList,
ArrayList<String> disallowPathList,
Date loadedDate,
Date modDate,
String eTag,
String sitemap,
Integer crawlDelay
) {
Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap,crawlDelay);
Entry entry = new Entry(
hostName, disallowPathList, loadedDate, modDate,
eTag, sitemap, crawlDelay);
addEntry(entry);
return entry;
}
@ -129,7 +131,7 @@ public class plasmaCrawlRobotsTxt {
public String addEntry(Entry entry) {
// writes a new page and returns key
try {
this.robotsTable.set(entry.hostName,entry.mem);
this.robotsTable.set(entry.hostName, entry.mem);
return entry.hostName;
} catch (IOException e) {
return null;
@ -145,16 +147,16 @@ public class plasmaCrawlRobotsTxt {
public static final String CRAWL_DELAY = "crawlDelay";
// this is a simple record structure that hold all properties of a single crawl start
Map mem;
private LinkedList disallowPathList;
Map<String, String> mem;
private LinkedList<String> disallowPathList;
String hostName;
public Entry(String hostName, Map mem) {
public Entry(String hostName, Map<String, String> mem) {
this.hostName = hostName.toLowerCase();
this.mem = mem;
if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
this.disallowPathList = new LinkedList();
this.disallowPathList = new LinkedList<String>();
String csPl = (String) this.mem.get(DISALLOW_PATH_LIST);
if (csPl.length() > 0){
String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
@ -163,13 +165,13 @@ public class plasmaCrawlRobotsTxt {
}
}
} else {
this.disallowPathList = new LinkedList();
this.disallowPathList = new LinkedList<String>();
}
}
public Entry(
String hostName,
ArrayList disallowPathList,
ArrayList<String> disallowPathList,
Date loadedDate,
Date modDate,
String eTag,
@ -179,9 +181,9 @@ public class plasmaCrawlRobotsTxt {
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");
this.hostName = hostName.trim().toLowerCase();
this.disallowPathList = new LinkedList();
this.disallowPathList = new LinkedList<String>();
this.mem = new HashMap(5);
this.mem = new HashMap<String, String>(5);
if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime()));
if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
if (eTag != null) this.mem.put(ETAG,eTag);
@ -259,9 +261,9 @@ public class plasmaCrawlRobotsTxt {
else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B");
Iterator pathIter = this.disallowPathList.iterator();
Iterator<String> pathIter = this.disallowPathList.iterator();
while (pathIter.hasNext()) {
String nextPath = (String) pathIter.next();
String nextPath = pathIter.next();
// allow rule
if (nextPath.startsWith("!") && nextPath.length() > 1 && path.startsWith(nextPath.substring(1))) {
return false;

@ -93,7 +93,7 @@ import de.anomic.yacy.yacyURL;
public final class plasmaHTCache {
public static final String DB_NAME = "responseHeader1.db";
public static final String DB_NAME = "responseHeader2.db";
private static final int stackLimit = 150; // if we exceed that limit, we do not check idle
public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day
@ -307,7 +307,7 @@ public final class plasmaHTCache {
private static void openResponseHeaderDB(long preloadTime) {
// open the response header database
File dbfile = new File(cachePath, DB_NAME);
responseHeaderDB = new kelondroMapObjects(new kelondroDyn(dbfile, true, true, preloadTime, yacySeedDB.commonHashLength, 150, '#', kelondroBase64Order.enhancedCoder, true, false, true), 500);
responseHeaderDB = new kelondroMapObjects(new kelondroDyn(dbfile, true, true, preloadTime, yacySeedDB.commonHashLength, 150, '#', kelondroBase64Order.enhancedCoder, false, false, true), 500);
}
private static void deleteOldHTCache(File directory) {

@ -0,0 +1,206 @@
// plasmaSearchAPI.java
// -----------------------
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2007-11-14 01:15:28 +0000 (Mi, 14 Nov 2007) $
// $LastChangedRevision: 4216 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.util.Date;
import java.util.Iterator;
import de.anomic.data.listManager;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;
public class plasmaSearchAPI {
// Collection of static methods for search servlets. It exists only so that the same procedures are not defined in more than one place.
/**
 * Translates the flag selection of a posted form into a bitfield filter.
 *
 * @param post the posted form values
 * @return {@code null} if "allurl" is "on" or if "flags" was posted as an empty string;
 *         the bitfield built from the posted "flags" string if that is present and non-empty;
 *         otherwise a bitfield with one flag set for every checkbox posted as "on"
 */
public static kelondroBitfield compileFlags(serverObjects post) {
    final kelondroBitfield selection = new kelondroBitfield(4);

    // "allurl" switches filtering off completely
    if (post.get("allurl", "").equals("on")) {
        return null;
    }

    // an explicitly posted flag string wins over the individual checkboxes
    if (post.get("flags") != null) {
        return (post.get("flags","").length() == 0)
                ? null
                : new kelondroBitfield(4, (String) post.get("flags"));
    }

    // checkbox name -> flag position; both arrays are index-aligned
    final String[] checkboxes = {
        "description", "title", "creator", "subject", "url", "emphasized",
        "image", "audio", "video", "app", "indexof"
    };
    final int[] flagPositions = {
        indexRWIEntry.flag_app_dc_description,
        indexRWIEntry.flag_app_dc_title,
        indexRWIEntry.flag_app_dc_creator,
        indexRWIEntry.flag_app_dc_subject,
        indexRWIEntry.flag_app_dc_identifier,
        indexRWIEntry.flag_app_emphasized,
        plasmaCondenser.flag_cat_hasimage,
        plasmaCondenser.flag_cat_hasaudio,
        plasmaCondenser.flag_cat_hasvideo,
        plasmaCondenser.flag_cat_hasapp,
        plasmaCondenser.flag_cat_indexof
    };
    for (int n = 0; n < checkboxes.length; n++) {
        if (post.get(checkboxes[n], "").equals("on")) {
            selection.set(flagPositions[n], true);
        }
    }
    return selection;
}
/**
 * Writes the list of seeds returned by the DHT agent for the given hash into the
 * "searchresult_hosts" properties.
 *
 * @param prop      the property object that is filled
 * @param startHash the hash handed to {@code getAcceptRemoteIndexSeeds}; also stored as "searchresult_keyhash"
 */
public static void listHosts(serverObjects prop, String startHash) {
    prop.put("searchresult_keyhash", startHash);

    // no seed database or no connected peers: report an empty host list
    if (yacyCore.seedDB == null || yacyCore.seedDB.sizeConnected() <= 0) {
        prop.put("searchresult_hosts", "0");
        return;
    }

    int hostCount = 0;
    final Iterator<yacySeed> seeds = yacyCore.dhtAgent.getAcceptRemoteIndexSeeds(startHash);
    while (seeds.hasNext()) {
        final yacySeed peer = seeds.next();
        if (peer == null) {
            continue; // null seeds are not listed and do not consume an index
        }
        final String keyPrefix = "searchresult_hosts_" + hostCount;
        prop.put(keyPrefix + "_hosthash", peer.hash);
        prop.putHTML(keyPrefix + "_hostname", peer.hash + " " + peer.get(yacySeed.NAME, "nameless"));
        hostCount++;
    }
    prop.put("searchresult_hosts", hostCount);
}
/**
 * Runs a ranking process for a single key hash and writes the result summary into prop.
 *
 * When the filtered count is zero, "searchresult" is set to 2 and the key hash is
 * stored under "searchresult_wordhash". Otherwise "searchresult" is set to 3, the
 * filtered count is stored under "searchresult_allurl", and one counter per flag is
 * stored, taken from the ranking process' flag count array.
 *
 * @param prop      the servlet properties to fill
 * @param sb        the switchboard providing the word index and the ranking profile
 * @param keyhash   the word hash to search for
 * @param filter    the flag filter handed to the query (passed through unchanged)
 * @param sortorder the sort order handed to the ranking process
 * @param fetchURLs passed to execQuery
 * @return the executed ranking process
 */
public static plasmaSearchRankingProcess genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, int sortorder, boolean fetchURLs) {
    final plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, sb.getRanking(), filter);
    final plasmaSearchRankingProcess ranked = new plasmaSearchRankingProcess(sb.wordIndex, query, sortorder, Integer.MAX_VALUE);
    ranked.execQuery(fetchURLs);

    // nothing passed the filter: only report the word hash
    if (ranked.filteredCount() == 0) {
        prop.put("searchresult", 2);
        prop.put("searchresult_wordhash", keyhash);
        return ranked;
    }

    prop.put("searchresult", 3);
    prop.put("searchresult_allurl", ranked.filteredCount());

    // counters for the appearance flags
    prop.put("searchresult_description", ranked.flagCount()[indexRWIEntry.flag_app_dc_description]);
    prop.put("searchresult_title", ranked.flagCount()[indexRWIEntry.flag_app_dc_title]);
    prop.put("searchresult_creator", ranked.flagCount()[indexRWIEntry.flag_app_dc_creator]);
    prop.put("searchresult_subject", ranked.flagCount()[indexRWIEntry.flag_app_dc_subject]);
    prop.put("searchresult_url", ranked.flagCount()[indexRWIEntry.flag_app_dc_identifier]);
    prop.put("searchresult_emphasized", ranked.flagCount()[indexRWIEntry.flag_app_emphasized]);

    // counters for the category flags
    prop.put("searchresult_image", ranked.flagCount()[plasmaCondenser.flag_cat_hasimage]);
    prop.put("searchresult_audio", ranked.flagCount()[plasmaCondenser.flag_cat_hasaudio]);
    prop.put("searchresult_video", ranked.flagCount()[plasmaCondenser.flag_cat_hasvideo]);
    prop.put("searchresult_app", ranked.flagCount()[plasmaCondenser.flag_cat_hasapp]);
    prop.put("searchresult_indexof", ranked.flagCount()[plasmaCondenser.flag_cat_indexof]);

    return ranked;
}
/**
 * Searches for a word hash and generates the list of url entries in prop.
 *
 * When the ranking process has a filtered count of zero, only the status fields
 * ("genUrlList" = 1, "genUrlList_count" = 0, "searchresult" = 2) are written.
 * Otherwise the best entries are taken from the ranking process one by one and
 * written as numbered rows with "urlExists" = "1"; after that, every url hash from
 * ranked.miss() is appended as a row with "urlExists" = "0". Finally the row count,
 * the key string and the blacklist names are written.
 *
 * @param prop      the servlet properties to fill
 * @param keyhash   the word hash, echoed into the output
 * @param keystring the word, echoed (HTML-escaped) into the output
 * @param ranked    the executed ranking process that delivers the entries
 * @param flags     the active flag filter; null is exported as an empty string
 * @param maxlines  row limit for the ranked entries; a negative value disables the limit
 * @param ordering  echoed into the output as "genUrlList_ordering"
 */
public static void genURLList(serverObjects prop, String keyhash, String keystring, plasmaSearchRankingProcess ranked, kelondroBitfield flags, int maxlines, int ordering) {
    prop.put("genUrlList_keyHash", keyhash);

    // empty result: write the status fields only
    if (ranked.filteredCount() == 0) {
        prop.put("genUrlList", 1);
        prop.put("genUrlList_count", 0);
        prop.put("searchresult", 2);
        return;
    }

    prop.put("genUrlList", 2);
    prop.put("searchresult", 3);
    prop.put("genUrlList_flags", (flags == null) ? "" : flags.exportB64());
    prop.put("genUrlList_lines", maxlines);
    prop.put("genUrlList_ordering", ordering);

    int row = 0;
    long firstRanking = -1; // ranking of the first listed entry; the other rankings are shown relative to it
    indexURLEntry entry;

    // first part: entries delivered by the ranking process
    while (ranked.size() > 0) {
        entry = ranked.bestURL(false);
        if (entry == null) break;
        if (entry.comp() == null) continue;
        final yacyURL url = entry.comp().url();
        if (url == null) continue;

        final String urlString = url.toNormalform(false, false);
        if (firstRanking == -1) firstRanking = entry.ranking();

        // shortened form of the url: at most two chunks of 20 characters
        // NOTE(review): this value is written with put(), not putHTML(), presumably so the <br> survives;
        // confirm that the url string cannot carry markup.
        final String urlStringShort;
        if (urlString.length() > 40) {
            urlStringShort = urlString.substring(0, 20) + "<br>" + urlString.substring(20, 40) + "...";
        } else if (urlString.length() > 30) {
            urlStringShort = urlString.substring(0, 20) + "<br>" + urlString.substring(20);
        } else {
            urlStringShort = urlString;
        }

        final String rowKey = "genUrlList_urlList_" + row + "_urlExists";
        prop.put(rowKey, "1");
        prop.put(rowKey + "_urlhxCount", row);
        prop.putHTML(rowKey + "_urlhxValue", entry.word().urlHash());
        prop.putHTML(rowKey + "_keyString", keystring);
        prop.put(rowKey + "_keyHash", keyhash);
        prop.putHTML(rowKey + "_urlString", urlString);
        prop.put(rowKey + "_urlStringShort", urlStringShort);
        prop.putNum(rowKey + "_ranking", (entry.ranking() - firstRanking));
        prop.putNum(rowKey + "_domlength", yacyURL.domLengthEstimation(entry.hash()));
        prop.putNum(rowKey + "_ybr", plasmaSearchRankingProcess.ybr(entry.hash()));
        prop.putNum(rowKey + "_authority", ranked.getOrder().authority(entry.hash()));
        prop.put(rowKey + "_date", serverDate.formatShortDay(new Date(entry.word().lastModified())));
        prop.putNum(rowKey + "_wordsintitle", entry.word().wordsintitle());
        prop.putNum(rowKey + "_wordsintext", entry.word().wordsintext());
        prop.putNum(rowKey + "_phrasesintext", entry.word().phrasesintext());
        prop.putNum(rowKey + "_llocal", entry.word().llocal());
        prop.putNum(rowKey + "_lother", entry.word().lother());
        prop.putNum(rowKey + "_hitcount", entry.word().hitcount());
        prop.putNum(rowKey + "_worddistance", entry.word().worddistance());
        prop.putNum(rowKey + "_pos", entry.word().posintext());
        prop.putNum(rowKey + "_phrase", entry.word().posofphrase());
        prop.putNum(rowKey + "_posinphrase", entry.word().posinphrase());
        prop.putNum(rowKey + "_urlcomps", entry.word().urlcomps());
        prop.putNum(rowKey + "_urllength", entry.word().urllength());

        // textual summary of the flags that are set for this entry
        prop.put(rowKey + "_props",
                ((entry.word().flags().get(plasmaCondenser.flag_cat_indexof)) ? "appears on index page, " : "") +
                ((entry.word().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") +
                ((entry.word().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
                ((entry.word().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") +
                ((entry.word().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") +
                ((entry.word().flags().get(indexRWIEntry.flag_app_dc_identifier)) ? "appears in url, " : "") +
                ((entry.word().flags().get(indexRWIEntry.flag_app_dc_title)) ? "appears in title, " : "") +
                ((entry.word().flags().get(indexRWIEntry.flag_app_dc_creator)) ? "appears in author, " : "") +
                ((entry.word().flags().get(indexRWIEntry.flag_app_dc_subject)) ? "appears in subject, " : "") +
                ((entry.word().flags().get(indexRWIEntry.flag_app_dc_description)) ? "appears in description, " : "") +
                ((entry.word().flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized, " : "") +
                ((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "")
        );

        // mark urls that are listed in the DHT blacklist
        if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, url)) {
            prop.put(rowKey + "_urlhxChecked", "1");
        }

        row++;
        if ((maxlines >= 0) && (row >= maxlines)) break;
    }

    // second part: url hashes reported by ranked.miss(), appended without the line limit
    final Iterator<String> missed = ranked.miss(); // iterates url hash strings
    while (missed.hasNext()) {
        final String missedHash = missed.next();
        final String rowKey = "genUrlList_urlList_" + row + "_urlExists";
        prop.put(rowKey, "0");
        prop.put(rowKey + "_urlhxCount", row);
        prop.putHTML(rowKey + "_urlhxValue", missedHash);
        row++;
    }

    prop.put("genUrlList_urlList", row);
    prop.putHTML("genUrlList_keyString", keystring);
    prop.put("genUrlList_count", row);
    putBlacklists(prop, listManager.getDirListing(listManager.listsPath));
}
/**
 * Writes the given list names into prop as numbered "genUrlList_blacklists" entries.
 *
 * @param prop  the servlet properties to fill
 * @param lists the list names; the number of names is stored under "genUrlList_blacklists"
 */
public static void putBlacklists(serverObjects prop, String[] lists) {
    prop.put("genUrlList_blacklists", lists.length);
    int position = 0;
    for (final String listName : lists) {
        prop.put("genUrlList_blacklists_" + position + "_name", listName);
        position++;
    }
}
}

@ -856,15 +856,15 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
*
* @see plasmaSwitchboard#DBPATH for the folder this file lies in
*/
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.db";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.db";
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive1.db";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive1.db";
/**
* <p><code>public static final String <strong>DBFILE_CRAWL_ROBOTS</strong> = "crawlRobotsTxt1.db"</code></p>
* <p>Name of the file containing the database holding all <code>robots.txt</code>-entries of the lately crawled domains</p>
*
* @see plasmaSwitchboard#DBPATH for the folder this file lies in
*/
public static final String DBFILE_CRAWL_ROBOTS = "crawlRobotsTxt.db";
public static final String DBFILE_CRAWL_ROBOTS = "crawlRobotsTxt1.db";
/**
* <p><code>public static final String <strong>DBFILE_USER</strong> = "DATA/SETTINGS/user.db"</code></p>
* <p>Path to the user-DB, beginning from the YaCy-installation's top-folder. It holds all rights the created

@ -134,14 +134,13 @@ public class yacyCore {
long memDHT_time = Long.parseLong(switchboard.getConfig("ramCacheDHT_time", "1000"));
seedDB = new yacySeedDB(
sb,
new File(yacyDBPath, "seed1.new.db"),
new File(yacyDBPath, "seed1.old.db"),
new File(yacyDBPath, "seed1.pot.db"),
new File(yacyDBPath, "seed2.new.db"),
new File(yacyDBPath, "seed2.old.db"),
new File(yacyDBPath, "seed2.pot.db"),
memDHT_time);
// create or init news database
long memNews_time = Long.parseLong(switchboard.getConfig("ramCacheNews_time", "1000"));
newsPool = new yacyNewsPool(yacyDBPath, memNews_time);
newsPool = new yacyNewsPool(yacyDBPath);
loadSeedUploadMethods();

@ -50,30 +50,28 @@ import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroEcoTable;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree;
import de.anomic.server.serverCodings;
import de.anomic.server.serverDate;
public class yacyNewsDB {
private File path;
private long preloadTime;
protected kelondroIndex news;
public yacyNewsDB(File path, long preloadTime) {
public yacyNewsDB(File path) {
this.path = path;
this.preloadTime = preloadTime;
this.news = new kelondroCache(kelondroTree.open(path, true, preloadTime, yacyNewsRecord.rowdef));
this.news = new kelondroEcoTable(path, yacyNewsRecord.rowdef, kelondroEcoTable.tailCacheUsageAuto, 10, 0);
//this.news = new kelondroCache(kelondroTree.open(path, true, preloadTime, yacyNewsRecord.rowdef));
}
private void resetDB() {
try {close();} catch (Exception e) {}
if (path.exists()) path.delete();
this.news = new kelondroCache(kelondroTree.open(path, true, preloadTime, yacyNewsRecord.rowdef));
this.news = new kelondroEcoTable(path, yacyNewsRecord.rowdef, kelondroEcoTable.tailCacheUsageAuto, 10, 0);
}
public void close() {

@ -265,8 +265,8 @@ public class yacyNewsPool {
private int maxDistribution;
public yacyNewsPool(File yacyDBPath, long preloadTime) {
newsDB = new yacyNewsDB(new File(yacyDBPath, "news1.db"), preloadTime);
public yacyNewsPool(File yacyDBPath) {
newsDB = new yacyNewsDB(new File(yacyDBPath, "news2.db"));
outgoingNews = new yacyNewsQueue(new File(yacyDBPath, "newsOut1.stack"), newsDB);
publishedNews = new yacyNewsQueue(new File(yacyDBPath, "newsPublished1.stack"), newsDB);
incomingNews = new yacyNewsQueue(new File(yacyDBPath, "newsIn1.stack"), newsDB);

@ -627,9 +627,6 @@ ramCacheWiki_time = 500
# ram cache for blog.db
ramCacheBlog_time = 500
# ram cache for news1.db
ramCacheNews_time = 1000
# ram cache for robotsTxt.db
ramCacheRobots_time = 0

Loading…
Cancel
Save