fixed dht transmission; added url-blacklist blocking also for remote search

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@398 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 311e627363
commit 40036ba69c

@ -182,8 +182,8 @@ public class Blacklist_p {
}else{
prop.put("status", 1);//removed
prop.put("status_item", line);
if (listManager.switchboard.blackListURLs != null)
listManager.switchboard.blackListURLs.remove(line);
if (listManager.switchboard.urlBlacklist != null)
listManager.switchboard.urlBlacklist.remove(line);
}
}
prop.put("Itemlist", numItems);
@ -215,8 +215,8 @@ public class Blacklist_p {
prop.put("status_item", newItem);//added
//add to blacklist
if (listManager.switchboard.blackListURLs != null)
listManager.switchboard.blackListURLs.put(newItem.substring(0, pos), newItem.substring(pos + 1));
if (listManager.switchboard.urlBlacklist != null)
listManager.switchboard.urlBlacklist.add(newItem.substring(0, pos), newItem.substring(pos + 1));
}
listManager.writeList(new File(listManager.listsPath, filename), out);

@ -164,7 +164,7 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashdelete")) {
plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
if (url == null) {
prop.put("result", "No Entry for url hash " + urlhash + "; nothing deleted.");
@ -230,7 +230,7 @@ public class IndexControl_p {
URL url = new URL(urlstring);
urlhash = plasmaURL.urlHash(url);
prop.put("urlhash", urlhash);
plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
} catch (MalformedURLException e) {
prop.put("urlstring", "wrong url: " + urlstring);
@ -239,7 +239,7 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashsearch")) {
plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
if (url == null) {
prop.put("result", "No Entry for url hash " + urlhash);
@ -301,7 +301,7 @@ public class IndexControl_p {
return prop;
}
public static String genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.entry entry, String urlhash) {
public static String genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.Entry entry, String urlhash) {
if (entry == null) return "No entry found for url-hash " + urlhash;
URL url = entry.url();
if (url == null) return "No entry found for url-hash " + urlhash;

@ -443,7 +443,7 @@ public class dir {
try {
URL url = new URL(urlstring);
plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
plasmaCrawlLURL.entry newEntry = switchboard.urlPool.loadedURL.newEntry(
plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.addEntry(
url, "YaCyShare: " + descr, new Date(), new Date(),
"____________", /*initiator*/
yacyCore.seedDB.mySeed.hash, /*executor*/

@ -241,8 +241,8 @@ public class sharedBlacklist_p {
out += newItem+"\n";
prop.put("status_list_"+count+"_entry", newItem);
count++;
if (switchboard.blackListURLs != null)
switchboard.blackListURLs.put(newItem.substring(0, pos), newItem.substring(pos + 1));
if (switchboard.urlBlacklist != null)
switchboard.urlBlacklist.add(newItem.substring(0, pos), newItem.substring(pos + 1));
//write the list
try{

@ -152,7 +152,7 @@ public class crawlOrder {
reason = reasonString;
delay = "" + (acceptDelay / 4);
// send lurl-Entry as response
plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url));
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url));
if (entry != null) {
response = "double";
switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);

@ -111,11 +111,11 @@ public class crawlReceipt {
prop.put("delay", "3600");
} else if (result.equals("fill")) {
// put new data into database
switchboard.urlPool.loadedURL.newEntry(propStr, true, youare, iam, 1);
switchboard.urlPool.loadedURL.addEntry(switchboard.urlPool.loadedURL.newEntry(propStr, true), youare, iam, 1);
switchboard.urlPool.noticeURL.remove(urlhash);
// write log
plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
if (entry == null) {
switchboard.getLog().logError("RECEIVED wrong RECEIPT for hash " + urlhash + " from peer " + iam);
} else {

@ -49,6 +49,7 @@ import java.net.MalformedURLException;
import de.anomic.http.httpHeader;
import de.anomic.http.httpdProxyHandler;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
@ -73,34 +74,28 @@ public class transferURL {
// response values
String result = "";
String doublevalues = "0";
URL url;
if (granted) {
int received = 0;
int sizeBefore = switchboard.urlPool.loadedURL.size();
// read the urls from the other properties and store
String urls;
plasmaCrawlLURL.Entry lEntry;
for (int i = 0; i < urlc; i++) {
urls = (String) post.get("url" + i);
if (urls == null) {
yacyCore.log.logDebug("transferURL: got null url-String from peer " + youare);
} else {
try {
url = new URL(urls);
} catch (MalformedURLException e) {
yacyCore.log.logDebug("transferURL: got malformed url-String '" + urls + "' from peer " + youare);
urls = null;
url = null;
}
if ((urls != null) && (blockBlacklist)) {
if (switchboard.blacklistedURL(url.getHost().toLowerCase(), url.getPath())) {
yacyCore.log.logDebug("transferURL: blocked blacklisted url '" + urls + "' from peer " + youare);
urls = null;
lEntry = switchboard.urlPool.loadedURL.newEntry(urls, true);
if ((lEntry != null) && (blockBlacklist)) {
if (switchboard.urlBlacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath())) {
yacyCore.log.logDebug("transferURL: blocked blacklisted url '" + lEntry.url() + "' from peer " + youare);
lEntry = null;
}
}
if (urls != null) {
switchboard.urlPool.loadedURL.newEntry(urls, true, iam, iam, 3);
yacyCore.log.logDebug("transferURL: received url '" + urls + "' from peer " + youare);
if (lEntry != null) {
switchboard.urlPool.loadedURL.addEntry(lEntry, iam, iam, 3);
yacyCore.log.logDebug("transferURL: received url '" + lEntry.url() + "' from peer " + youare);
received++;
}
}

@ -233,11 +233,8 @@ public class listManager {
/**
 * Loads all active blacklists in the proxy.
 *
 * Reads the list names from the config key "proxyBlackListsActive" and
 * rebuilds both switchboard.blackListURLs and switchboard.urlBlacklist from
 * them. When no list is configured, blackListURLs becomes an empty map and
 * urlBlacklist is left cleared.
 */
public static void reloadBlacklists(){
    String f = switchboard.getConfig("proxyBlackListsActive", "");
    // Compare by content: the former `f != ""` only compared object references,
    // so an empty value that is not the very same String instance as the
    // literal was treated as "lists configured".
    boolean listsConfigured = (f != null) && (f.length() > 0);
    if (listsConfigured){
        switchboard.blackListURLs = switchboard.loadBlacklist("black", f, "/");
    }else{
        switchboard.blackListURLs = new TreeMap();
    }
    // always start from an empty pattern set, then refill it if lists are configured
    switchboard.urlBlacklist.clear();
    if (listsConfigured) switchboard.urlBlacklist.loadLists("black", f, "/");
}

@ -311,7 +311,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// blacklist idea inspired by [AS]:
// respond a 404 for all AGIS ("all you get is shit") servers
String hostlow = host.toLowerCase();
if (switchboard.blacklistedURL(hostlow, path)) {
if (switchboard.urlBlacklist.isListed(hostlow, path)) {
httpd.sendRespondError(conProp,respond,4,403,null,
"URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null);
this.theLogger.logInfo("AGIS blocking of host '" + hostlow + "'");
@ -797,7 +797,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// check the blacklist, inspired by [AS]: respond a 404 for all AGIS (all you get is shit) servers
String hostlow = host.toLowerCase();
if (switchboard.blacklistedURL(hostlow, path)) {
if (switchboard.urlBlacklist.isListed(hostlow, path)) {
try {
byte[] errorMsg = ("404 (generated): URL '" + hostlow + "' blocked by yacy proxy (blacklisted)\r\n").getBytes();
httpd.sendRespondHeader(conProp,respond,httpVer,404,"Not Found (AGIS)",0);

@ -120,13 +120,13 @@ public class plasmaCrawlLURL extends plasmaURL {
}
public synchronized entry newEntry(URL url, String descr, Date moddate, Date loaddate,
public synchronized Entry addEntry(URL url, String descr, Date moddate, Date loaddate,
String initiatorHash, String executorHash,
String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype,
long size, int wordCount,
int stackType) {
entry e = new entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
if (initiatorHash == null) initiatorHash = dummyHash;
if (executorHash == null) executorHash = dummyHash;
switch (stackType) {
@ -137,16 +137,37 @@ public class plasmaCrawlLURL extends plasmaURL {
case 4: proxyResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
}
return e;
}
public synchronized entry newEntry(String propStr, boolean setGlobal, String initiatorHash, String executorHash, int stackType) {
/**
 * Registers an already constructed entry on one of the result stacks.
 *
 * The record pushed onto the stack is the concatenation of the entry's url
 * hash, the initiator hash and the executor hash. A null initiator or
 * executor hash is replaced by dummyHash.
 *
 * @param e             the entry to register; null is answered with null
 * @param initiatorHash hash of the initiating peer, may be null
 * @param executorHash  hash of the executing peer, may be null
 * @param stackType     0 = no stack, 1 = externResultStack, 2 = searchResultStack,
 *                      3 = transfResultStack, 4 = proxyResultStack,
 *                      5 = lcrawlResultStack, 6 = gcrawlResultStack;
 *                      any other value registers nothing
 * @return the entry that was passed in, or null if it was null or the
 *         registration threw an exception
 */
public synchronized Entry addEntry(Entry e, String initiatorHash, String executorHash, int stackType) {
    if (e == null) return null;
    try {
        if (initiatorHash == null) initiatorHash = dummyHash;
        if (executorHash == null) executorHash = dummyHash;
        // same record layout for every stack
        String stackRecord = e.urlHash + initiatorHash + executorHash;
        switch (stackType) {
            case 0: break;
            case 1: externResultStack.add(stackRecord); break;
            case 2: searchResultStack.add(stackRecord); break;
            case 3: transfResultStack.add(stackRecord); break;
            case 4: proxyResultStack.add(stackRecord); break;
            case 5: lcrawlResultStack.add(stackRecord); break;
            case 6: gcrawlResultStack.add(stackRecord); break;
        }
        return e;
    } catch (Exception ex) {
        // the message previously named "newEntry/2", which is not this method
        System.out.println("INTERNAL ERROR in addEntry: " + ex.toString());
        return null;
    }
}
/*
public synchronized Entry addEntry(String propStr, boolean setGlobal, String initiatorHash, String executorHash, int stackType) {
if ((propStr.startsWith("{")) && (propStr.endsWith("}"))) {
//System.out.println("DEBUG: propStr=" + propStr);
try {
entry e = new entry(s2p(propStr.substring(1, propStr.length() - 1)), setGlobal);
Entry e = new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal);
if (initiatorHash == null) initiatorHash = dummyHash;
if (executorHash == null) executorHash = dummyHash;
switch (stackType) {
@ -157,26 +178,34 @@ public class plasmaCrawlLURL extends plasmaURL {
case 4: proxyResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
}
return e;
} catch (Exception e) {
System.out.println("INTERNAL ERROR in newEntry/2: " + e.toString());
} catch (Exception ex) {
System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString());
return null;
}
} else {
return null;
}
}
*/
/**
 * Appends a record to gcrawlResultStack.
 *
 * The record is the plain concatenation of the three hashes in the order
 * url hash, initiator hash, executor hash; the arguments are used as given.
 *
 * @param urlHash       hash of the url
 * @param initiatorHash hash of the initiating peer
 * @param executorHash  hash of the executing peer
 */
public void notifyGCrawl(String urlHash, String initiatorHash, String executorHash) {
    final String stackRecord = urlHash + initiatorHash + executorHash;
    gcrawlResultStack.add(stackRecord);
}
public synchronized entry getEntry(String hash) {
return new entry(hash);
public synchronized Entry getEntry(String hash) {
return new Entry(hash);
}
/**
 * Builds an entry from its property-string form without putting it on any
 * result stack.
 *
 * The string must be wrapped in curly braces; the text between the braces
 * is parsed with serverCodings.s2p and handed to the Entry constructor.
 *
 * @param propStr   the property string, expected as "{...}"; may be null
 * @param setGlobal passed through to the Entry constructor
 * @return the new entry, or null if propStr is null or not wrapped in braces
 */
public synchronized Entry newEntry(String propStr, boolean setGlobal) {
    // a missing string is treated like any other unparsable input
    // instead of failing with a NullPointerException
    if (propStr == null) return null;
    if ((propStr.startsWith("{")) && (propStr.endsWith("}"))) {
        return new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal);
    }
    return null;
}
public int getStackSize(int stack) {
switch (stack) {
case 1: return externResultStack.size();
@ -282,7 +311,7 @@ public class plasmaCrawlLURL extends plasmaURL {
boolean dark = true;
String urlHash, initiatorHash, executorHash;
plasmaCrawlLURL.entry urle;
plasmaCrawlLURL.Entry urle;
yacySeed initiatorSeed, executorSeed;
String cachepath;
int c = 0;
@ -318,7 +347,7 @@ public class plasmaCrawlLURL extends plasmaURL {
return prop;
}
public class entry {
public class Entry {
private URL url;
private String descr;
@ -335,7 +364,7 @@ public class plasmaCrawlLURL extends plasmaURL {
private int wordCount;
private String snippet;
public entry(URL url, String descr, Date moddate, Date loaddate,
public Entry(URL url, String descr, Date moddate, Date loaddate,
String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype, long size, int wordCount) {
// create new entry and store it into database
@ -356,7 +385,7 @@ public class plasmaCrawlLURL extends plasmaURL {
store();
}
public entry(String urlHash) {
public Entry(String urlHash) {
// generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@ -389,7 +418,7 @@ public class plasmaCrawlLURL extends plasmaURL {
}
}
public entry(Properties prop, boolean setGlobal) {
public Entry(Properties prop, boolean setGlobal) {
// generates an plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
@ -591,7 +620,7 @@ public class plasmaCrawlLURL extends plasmaURL {
return i.hasNext();
}
public Object nextElement() {
return new entry(new String((byte[]) i.next()));
return new Entry(new String((byte[]) i.next()));
}
}
@ -613,7 +642,7 @@ public class plasmaCrawlLURL extends plasmaURL {
plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1);
Enumeration enu = urls.elements(true, false);
while (enu.hasMoreElements()) {
((entry) enu.nextElement()).print();
((Entry) enu.nextElement()).print();
}
} catch (Exception e) {
e.printStackTrace();

@ -361,9 +361,9 @@ public final class plasmaSearch {
return pageAcc.size() > 0;
}
public plasmaCrawlLURL.entry nextElement() {
public plasmaCrawlLURL.Entry nextElement() {
Object top = pageAcc.lastKey();
return (plasmaCrawlLURL.entry) pageAcc.remove(top);
return (plasmaCrawlLURL.Entry) pageAcc.remove(top);
}
protected void addResult(plasmaWordIndexEntry indexEntry) {
@ -373,7 +373,7 @@ public final class plasmaSearch {
// 2. add reference to reference sorting table
// find the url entry
plasmaCrawlLURL.entry page = urlStore.getEntry(indexEntry.getUrlHash());
plasmaCrawlLURL.Entry page = urlStore.getEntry(indexEntry.getUrlHash());
// take out relevant information for reference computation
URL url = page.url();
@ -402,7 +402,7 @@ public final class plasmaSearch {
Object[] resultVector;
plasmaWordIndexEntry indexEntry;
plasmaCrawlLURL.entry page;
plasmaCrawlLURL.Entry page;
String[] urlcomps;
String[] descrcomps;
long ranking;
@ -412,7 +412,7 @@ public final class plasmaSearch {
// take out values from result array
resultVector = (Object[]) results.get(i);
indexEntry = (plasmaWordIndexEntry) resultVector[0];
page = (plasmaCrawlLURL.entry) resultVector[1];
page = (plasmaCrawlLURL.Entry) resultVector[1];
urlcomps = (String[]) resultVector[2];
descrcomps = (String[]) resultVector[3];

@ -154,7 +154,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// coloured list management
public static TreeSet blueList = null;
public static TreeSet stopwords = null;
public static TreeMap blackListURLs = null;
public static plasmaURLPattern urlBlacklist;
// storage management
private File cachePath;
@ -221,12 +221,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// load the black-list / inspired by [AS]
urlBlacklist = new plasmaURLPattern(new File(getRootPath(), getConfig("listsPath", "DATA/LISTS")));
String f = getConfig("proxyBlackListsActive", null);
if (f != null) {
blackListURLs = loadBlacklist("black", f, "/");
log.logSystem("loaded black-list from file " + f + ", " + blackListURLs.size() + " entries");
} else {
blackListURLs = new TreeMap();
urlBlacklist.loadLists("black", f, "/");
log.logSystem("loaded black-list from file " + f + ", " + urlBlacklist.size() + " entries");
}
log.logSystem("Proxy Handler Initialized");
@ -402,41 +401,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
/**
 * Loads several list files and merges them into one map.
 *
 * The files are looked up below the directory named by the config key
 * "listsPath" (default "DATA/LISTS"), relative to the root path. Each file
 * is read with serverFileUtils.loadMap and its content is put into the
 * result in the order the names are given.
 *
 * @param mapname   passed through to serverFileUtils.loadMap
 * @param filenames comma-separated file names
 * @param sep       passed through to serverFileUtils.loadMap
 * @return the merged map; empty if no file name was given
 */
public TreeMap loadBlacklist(String mapname, String filenames, String sep) {
    final TreeMap merged = new TreeMap();
    final File listsDir = new File(getRootPath(), getConfig("listsPath", "DATA/LISTS"));
    final String[] names = filenames.split(",");
    // an empty array simply skips the loop
    for (int n = 0; n < names.length; n++) {
        final String listFile = (new File(listsDir, names[n])).toString();
        merged.putAll(serverFileUtils.loadMap(mapname, listFile, sep));
    }
    return merged;
}
/**
 * Checks whether a host/path combination is blocked by blackListURLs.
 *
 * Lookup order:
 * 1. host prefixes ending at each dot, with a trailing wildcard ("www.*", "www.example.*", ...)
 * 2. host suffixes starting at each dot, with a leading wildcard ("*.com", "*.example.com", ...)
 * 3. the host itself
 * The first key that is present decides the result; later keys are not
 * consulted even if the path does not match that key's pattern.
 *
 * @param hostlow the host name, already lower-cased by the caller
 * @param path    the url path; a leading "/" is ignored for matching
 * @return true if the host is listed and its path pattern is "*" or matches the path
 */
public boolean blacklistedURL(String hostlow, String path) {
    if (blackListURLs == null) return false;
    String pp; // path-pattern

    // first try to match the domain with wildcard '*'
    // [TL] While "." are found within the string
    int index = 0;
    while ((index = hostlow.indexOf('.', index + 1)) != -1) {
        pp = (String) blackListURLs.get(hostlow.substring(0, index + 1) + "*");
        if (pp != null) return pathMatches(pp, path);
    }
    index = hostlow.length();
    while ((index = hostlow.lastIndexOf('.', index - 1)) != -1) {
        pp = (String) blackListURLs.get("*" + hostlow.substring(index, hostlow.length()));
        if (pp != null) return pathMatches(pp, path);
    }

    // try to match without wildcard in domain
    pp = (String) blackListURLs.get(hostlow);
    return (pp != null) && pathMatches(pp, path);
}

/**
 * Tests a path against one path pattern: "*" matches everything, any other
 * pattern is applied as a regular expression to the path without its
 * leading "/".
 */
private static boolean pathMatches(String pp, String path) {
    if (pp.equals("*")) return true;
    if (path == null) path = "";
    // The former unconditional path.substring(1) threw on an empty path
    // and cut the first character off a path that had no leading slash.
    String relative = path.startsWith("/") ? path.substring(1) : path;
    return relative.matches(pp);
}
private static String ppRamString(int bytes) {
if (bytes < 1024) return bytes + " KByte";
@ -1022,7 +987,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
Date lastModified = entry.responseHeader().lastModified();
if (lastModified == null) lastModified = entry.responseHeader().date();
if (lastModified == null) lastModified = new Date();
plasmaCrawlLURL.entry newEntry = urlPool.loadedURL.newEntry(
plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.addEntry(
entry.url(), descr, lastModified, new Date(),
initiatorHash,
yacyCore.seedDB.mySeed.hash,
@ -1176,7 +1141,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (urlhash.equals(plasmaURL.dummyHash)) return null;
plasmaCrawlNURL.entry ne = urlPool.noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
plasmaCrawlLURL.entry le = urlPool.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.Entry le = urlPool.loadedURL.getEntry(urlhash);
if (le != null) return le.url();
plasmaCrawlEURL.entry ee = urlPool.errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
@ -1267,8 +1232,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.entry entry = urlPool.loadedURL.newEntry(propStr, true, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
urlPool.noticeURL.remove(entry.hash());
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.addEntry(
urlPool.loadedURL.newEntry(propStr, true),
yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
urlPool.noticeURL.remove(entry.hash());
log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + nexturlString + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
@ -1329,7 +1296,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public void fetchSnippets(plasmaSearch.result acc, Set queryhashes, String urlmask, int fetchcount) {
// fetch the snippets
int i = 0;
plasmaCrawlLURL.entry urlentry;
plasmaCrawlLURL.Entry urlentry;
String urlstring;
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < fetchcount)) {
@ -1398,7 +1365,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int fetchpeers = ((int) time / 1000) * 3; // number of target peers; means 30 peers in 10 seconds
long fetchtime = time * 7 / 10; // time to waste
if (fetchcount > count) fetchcount = count;
globalresults = yacySearch.searchHashes(queryhashes, urlPool.loadedURL, searchManager, fetchcount, fetchpeers, snippetCache, fetchtime);
globalresults = yacySearch.searchHashes(queryhashes, urlPool.loadedURL, searchManager, fetchcount, fetchpeers, urlBlacklist, snippetCache, fetchtime);
log.logDebug("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
}
prop.put("globalresults", globalresults); // the result are written to the local DB
@ -1425,7 +1392,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int i = 0;
int p;
URL url;
plasmaCrawlLURL.entry urlentry;
plasmaCrawlLURL.Entry urlentry;
String urlstring, urlname, filename;
String host, hash, address, descr = "";
yacySeed seed;
@ -1551,7 +1518,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
StringBuffer links = new StringBuffer();
String resource = "";
//plasmaIndexEntry pie;
plasmaCrawlLURL.entry urlentry;
plasmaCrawlLURL.Entry urlentry;
plasmaSnippetCache.result snippet;
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
@ -1627,7 +1594,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry
// determine the url string
plasmaCrawlLURL.entry entry = urlPool.loadedURL.getEntry(urlhash);
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
if (url == null) return 0;
// get set of words

@ -146,17 +146,4 @@ public class plasmaURL {
return urlHashCache.rows(up, false, urlHash.getBytes());
}
/**
 * Converts a string of the form "key1=value1, key2=value2, ..." into a
 * Properties object.
 *
 * The input is split at every comma. In each piece the first '=' separates
 * key and value, and both are trimmed. Pieces without '=' or with nothing
 * before it are skipped.
 *
 * @param s the string to parse; null is treated like an empty string
 * @return the parsed properties, never null
 */
protected static Properties s2p(String s) {
    final Properties props = new Properties();
    if (s == null) return props; // StringTokenizer would throw on null
    final StringTokenizer pairs = new StringTokenizer(s, ",");
    while (pairs.hasMoreTokens()) {
        final String pair = pairs.nextToken().trim();
        final int eq = pair.indexOf('=');
        if (eq <= 0) continue; // no '=' at all, or empty key
        props.setProperty(pair.substring(0, eq).trim(), pair.substring(eq + 1).trim());
    }
    return props;
}
}

@ -45,6 +45,8 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.security.MessageDigest;
import java.util.Properties;
import java.util.StringTokenizer;
public final class serverCodings {
@ -261,6 +263,19 @@ public final class serverCodings {
return null;
}
/**
 * Parses a comma-separated list of key=value pairs into a Properties object.
 *
 * Every comma ends a token, so values cannot contain commas. Within a token
 * the first '=' splits key from value, and surrounding whitespace is removed
 * from both. Tokens that contain no '=' or that start with '=' are ignored.
 *
 * @param s the text to parse; null yields an empty result
 * @return a new Properties object with the parsed pairs, never null
 */
public static Properties s2p(String s) {
    Properties p = new Properties();
    if (s == null) return p; // nothing to parse; avoids the NullPointerException in StringTokenizer
    StringTokenizer st = new StringTokenizer(s, ",");
    String token;
    int pos;
    while (st.hasMoreTokens()) {
        token = st.nextToken().trim();
        pos = token.indexOf('=');
        // pos > 0 rejects both "no separator" (-1) and "empty key" (0)
        if (pos > 0) p.setProperty(token.substring(0, pos).trim(), token.substring(pos + 1).trim());
    }
    return p;
}
public static void main(String[] s) {
serverCodings b64 = new serverCodings(true);
if (s.length == 0) {System.out.println("usage: -[ec|dc|es|ds] <arg>"); System.exit(0);}

@ -57,6 +57,7 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.plasma.plasmaURLPattern;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.tools.crypt;
@ -267,7 +268,8 @@ public class yacyClient {
public static int search(String wordhashes, int count, boolean global,
yacySeed targetPeer, plasmaCrawlLURL urlManager,
plasmaSearch searchManager, plasmaSnippetCache snippets,
plasmaSearch searchManager, plasmaURLPattern blacklist,
plasmaSnippetCache snippets,
long duetime) {
// send a search request to peer with remote Hash
// this mainly converts the words into word hashes
@ -335,7 +337,7 @@ public class yacyClient {
//System.out.println("yacyClient: search result = " + result.toString()); // debug
int results = Integer.parseInt((String) result.get("count"));
//System.out.println("***result count " + results);
plasmaCrawlLURL.entry link;
plasmaCrawlLURL.Entry link;
// create containers
int words = wordhashes.length() / plasmaWordIndexEntry.wordHashLength;
@ -345,9 +347,12 @@ public class yacyClient {
}
// insert results to containers
plasmaCrawlLURL.Entry lEntry;
for (int n = 0; n < results; n++) {
// get one single search result
link = urlManager.newEntry((String) result.get("resource" + n), true, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
lEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if ((lEntry != null) && (blacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))) continue; // block with backlist
link = urlManager.addEntry(lEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
plasmaWordIndexEntry entry = new plasmaWordIndexEntry(link.hash(), link.wordCount(), 0, 0, 0,
plasmaSearch.calcVirtualAge(link.moddate()), link.quality(),
@ -482,7 +487,7 @@ public class yacyClient {
-er crawlt, Ergebnis erscheint aber unter falschem initiator
*/
public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURL.entry entry, String wordhashes) {
public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURL.Entry entry, String wordhashes) {
if (targetSeed == null) return null;
if (yacyCore.seedDB.mySeed == null) return null;
if (yacyCore.seedDB.mySeed == targetSeed) return null;
@ -553,9 +558,9 @@ public class yacyClient {
if (uhs.length == 0) return null; // all url's known
// extract the urlCache from the result
HashMap urlCache = (HashMap) in.get("$URLCACHE$");
plasmaCrawlLURL.entry[] urls = new plasmaCrawlLURL.entry[uhs.length];
plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length];
for (int i = 0; i < uhs.length; i++) {
urls[i] = (plasmaCrawlLURL.entry) urlCache.get(uhs[i]);
urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]);
if (urls[i] == null) System.out.println("DEBUG transferIndex: error with requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
}
in = transferURL(targetSeed, urls);
@ -583,7 +588,7 @@ public class yacyClient {
Enumeration eenum;
plasmaWordIndexEntry entry;
HashMap urlCache = new HashMap();
plasmaCrawlLURL.entry urlentry;
plasmaCrawlLURL.Entry urlentry;
HashSet unknownURLs = new HashSet();
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].elements(true);
@ -646,7 +651,7 @@ public class yacyClient {
}
}
private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURL.entry[] urls) {
private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURL.Entry[] urls) {
// this post a message to the remote message board
String address = targetSeed.getAddress();
if (address == null) return null;

@ -46,6 +46,7 @@ import java.util.Set;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaURLPattern;
import de.anomic.plasma.plasmaSearch;
import de.anomic.plasma.plasmaSnippetCache;
@ -56,19 +57,21 @@ public class yacySearch extends Thread {
private boolean global;
private plasmaCrawlLURL urlManager;
private plasmaSearch searchManager;
private plasmaURLPattern blacklist;
private plasmaSnippetCache snippetCache;
private yacySeed targetPeer;
private int links;
private long duetime;
public yacySearch(Set wordhashes, int count, boolean global, yacySeed targetPeer,
plasmaCrawlLURL urlManager, plasmaSearch searchManager, plasmaSnippetCache snippetCache, long duetime) {
plasmaCrawlLURL urlManager, plasmaSearch searchManager, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, long duetime) {
super("yacySearch_" + targetPeer.getName());
this.wordhashes = wordhashes;
this.count = count;
this.global = global;
this.urlManager = urlManager;
this.searchManager = searchManager;
this.blacklist = blacklist;
this.snippetCache = snippetCache;
this.targetPeer = targetPeer;
this.links = -1;
@ -76,7 +79,7 @@ public class yacySearch extends Thread {
}
public void run() {
this.links = yacyClient.search(set2string(wordhashes), count, global, targetPeer, urlManager, searchManager, snippetCache, duetime);
this.links = yacyClient.search(set2string(wordhashes), count, global, targetPeer, urlManager, searchManager, blacklist, snippetCache, duetime);
if (links != 0) {
//yacyCore.log.logInfo("REMOTE SEARCH - remote peer '" + targetPeer.get("Name", "anonymous") + "' contributed " + links + " links for word hash " + wordhashes);
yacyCore.seedDB.mySeed.incRI(links);
@ -127,7 +130,7 @@ public class yacySearch extends Thread {
}
public static int searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaSearch searchManager,
int count, int targets, plasmaSnippetCache snippetCache, long waitingtime) {
int count, int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, long waitingtime) {
// check own peer status
if ((yacyCore.seedDB.mySeed == null) || (yacyCore.seedDB.mySeed.getAddress() == null)) return 0;
@ -147,7 +150,7 @@ public class yacySearch extends Thread {
yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) {
searchThreads[i]= new yacySearch(wordhashes, count, true, targetPeers[i],
urlManager, searchManager, snippetCache, duetime);
urlManager, searchManager, blacklist, snippetCache, duetime);
searchThreads[i].start();
try {Thread.currentThread().sleep(20);} catch (InterruptedException e) {}
if ((System.currentTimeMillis() - start) > waitingtime) {

@ -12,7 +12,7 @@
# INFO regular action information (i.e. any httpd request URL)
# FINEST in-function status debug output
PARSER.level = INFO
YACY.level = INFO
YACY.level = FINEST
HTCACHE.level = INFO
PLASMA.level = FINEST
SERVER.level = INFO

Loading…
Cancel
Save