From 40036ba69c24d9cf0e88e517b4a5679592efc8fb Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 12 Jul 2005 00:07:09 +0000 Subject: [PATCH] fixed dht transmission; added url-blacklist blocking also for remote search git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@398 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Blacklist_p.java | 8 +-- htroot/IndexControl_p.java | 8 +-- htroot/htdocsdefault/dir.java | 2 +- htroot/sharedBlacklist_p.java | 4 +- htroot/yacy/crawlOrder.java | 2 +- htroot/yacy/crawlReceipt.java | 4 +- htroot/yacy/transferURL.java | 25 +++---- source/de/anomic/data/listManager.java | 7 +- source/de/anomic/http/httpdProxyHandler.java | 4 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 65 ++++++++++++++----- source/de/anomic/plasma/plasmaSearch.java | 10 +-- .../de/anomic/plasma/plasmaSwitchboard.java | 65 +++++-------------- source/de/anomic/plasma/plasmaURL.java | 13 ---- source/de/anomic/server/serverCodings.java | 15 +++++ source/de/anomic/yacy/yacyClient.java | 21 +++--- source/de/anomic/yacy/yacySearch.java | 11 ++-- yacy.logging | 2 +- 17 files changed, 132 insertions(+), 134 deletions(-) diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java index b853b9b73..691adc83f 100644 --- a/htroot/Blacklist_p.java +++ b/htroot/Blacklist_p.java @@ -182,8 +182,8 @@ public class Blacklist_p { }else{ prop.put("status", 1);//removed prop.put("status_item", line); - if (listManager.switchboard.blackListURLs != null) - listManager.switchboard.blackListURLs.remove(line); + if (listManager.switchboard.urlBlacklist != null) + listManager.switchboard.urlBlacklist.remove(line); } } prop.put("Itemlist", numItems); @@ -215,8 +215,8 @@ public class Blacklist_p { prop.put("status_item", newItem);//added //add to blacklist - if (listManager.switchboard.blackListURLs != null) - listManager.switchboard.blackListURLs.put(newItem.substring(0, pos), newItem.substring(pos + 1)); + if (listManager.switchboard.urlBlacklist != null) + 
listManager.switchboard.urlBlacklist.add(newItem.substring(0, pos), newItem.substring(pos + 1)); } listManager.writeList(new File(listManager.listsPath, filename), out); diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index ae8e49208..567c6a627 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -164,7 +164,7 @@ public class IndexControl_p { } if (post.containsKey("urlhashdelete")) { - plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); URL url = entry.url(); if (url == null) { prop.put("result", "No Entry for url hash " + urlhash + "; nothing deleted."); @@ -230,7 +230,7 @@ public class IndexControl_p { URL url = new URL(urlstring); urlhash = plasmaURL.urlHash(url); prop.put("urlhash", urlhash); - plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); prop.put("result", genUrlProfile(switchboard, entry, urlhash)); } catch (MalformedURLException e) { prop.put("urlstring", "wrong url: " + urlstring); @@ -239,7 +239,7 @@ public class IndexControl_p { } if (post.containsKey("urlhashsearch")) { - plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); URL url = entry.url(); if (url == null) { prop.put("result", "No Entry for url hash " + urlhash); @@ -301,7 +301,7 @@ public class IndexControl_p { return prop; } - public static String genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.entry entry, String urlhash) { + public static String genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.Entry entry, String urlhash) { if (entry == null) return "No entry found for url-hash " + urlhash; URL url = entry.url(); if (url == null) return "No entry found for url-hash " + urlhash; diff --git 
a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index 5586ba4e7..7af0eb114 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -443,7 +443,7 @@ public class dir { try { URL url = new URL(urlstring); plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes())); - plasmaCrawlLURL.entry newEntry = switchboard.urlPool.loadedURL.newEntry( + plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.addEntry( url, "YaCyShare: " + descr, new Date(), new Date(), "____________", /*initiator*/ yacyCore.seedDB.mySeed.hash, /*executor*/ diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java index e00bdcf5f..5672fcf28 100644 --- a/htroot/sharedBlacklist_p.java +++ b/htroot/sharedBlacklist_p.java @@ -241,8 +241,8 @@ public class sharedBlacklist_p { out += newItem+"\n"; prop.put("status_list_"+count+"_entry", newItem); count++; - if (switchboard.blackListURLs != null) - switchboard.blackListURLs.put(newItem.substring(0, pos), newItem.substring(pos + 1)); + if (switchboard.urlBlacklist != null) + switchboard.urlBlacklist.add(newItem.substring(0, pos), newItem.substring(pos + 1)); //write the list try{ diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java index 3e889ba36..78429ff02 100644 --- a/htroot/yacy/crawlOrder.java +++ b/htroot/yacy/crawlOrder.java @@ -152,7 +152,7 @@ public class crawlOrder { reason = reasonString; delay = "" + (acceptDelay / 4); // send lurl-Entry as response - plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url)); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaCrawlLURL.urlHash(url)); if (entry != null) { response = "double"; switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare); diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 5928d6098..142c04f24 100644 --- 
a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -111,11 +111,11 @@ public class crawlReceipt { prop.put("delay", "3600"); } else if (result.equals("fill")) { // put new data into database - switchboard.urlPool.loadedURL.newEntry(propStr, true, youare, iam, 1); + switchboard.urlPool.loadedURL.addEntry(switchboard.urlPool.loadedURL.newEntry(propStr, true), youare, iam, 1); switchboard.urlPool.noticeURL.remove(urlhash); // write log - plasmaCrawlLURL.entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); if (entry == null) { switchboard.getLog().logError("RECEIVED wrong RECEIPT for hash " + urlhash + " from peer " + iam); } else { diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 55911e315..f8d87d084 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -49,6 +49,7 @@ import java.net.MalformedURLException; import de.anomic.http.httpHeader; import de.anomic.http.httpdProxyHandler; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; @@ -73,34 +74,28 @@ public class transferURL { // response values String result = ""; String doublevalues = "0"; - URL url; if (granted) { int received = 0; int sizeBefore = switchboard.urlPool.loadedURL.size(); // read the urls from the other properties and store String urls; + plasmaCrawlLURL.Entry lEntry; for (int i = 0; i < urlc; i++) { urls = (String) post.get("url" + i); if (urls == null) { yacyCore.log.logDebug("transferURL: got null url-String from peer " + youare); } else { - try { - url = new URL(urls); - } catch (MalformedURLException e) { - yacyCore.log.logDebug("transferURL: got malformed url-String '" + urls + "' from peer " + youare); - urls = null; - url = null; - } - if ((urls != null) && (blockBlacklist)) { - if 
(switchboard.blacklistedURL(url.getHost().toLowerCase(), url.getPath())) { - yacyCore.log.logDebug("transferURL: blocked blacklisted url '" + urls + "' from peer " + youare); - urls = null; + lEntry = switchboard.urlPool.loadedURL.newEntry(urls, true); + if ((lEntry != null) && (blockBlacklist)) { + if (switchboard.urlBlacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath())) { + yacyCore.log.logDebug("transferURL: blocked blacklisted url '" + lEntry.url() + "' from peer " + youare); + lEntry = null; } } - if (urls != null) { - switchboard.urlPool.loadedURL.newEntry(urls, true, iam, iam, 3); - yacyCore.log.logDebug("transferURL: received url '" + urls + "' from peer " + youare); + if (lEntry != null) { + switchboard.urlPool.loadedURL.addEntry(lEntry, iam, iam, 3); + yacyCore.log.logDebug("transferURL: received url '" + lEntry.url() + "' from peer " + youare); received++; } } diff --git a/source/de/anomic/data/listManager.java b/source/de/anomic/data/listManager.java index 8271bf785..088ded3d2 100644 --- a/source/de/anomic/data/listManager.java +++ b/source/de/anomic/data/listManager.java @@ -233,11 +233,8 @@ public class listManager { //load all active Blacklists in the Proxy public static void reloadBlacklists(){ String f = switchboard.getConfig("proxyBlackListsActive", ""); - if (f != ""){ - switchboard.blackListURLs = switchboard.loadBlacklist("black", f, "/"); - }else{ - switchboard.blackListURLs = new TreeMap(); - } + switchboard.urlBlacklist.clear(); + if (f != "") switchboard.urlBlacklist.loadLists("black", f, "/"); } diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 7c1a049d3..f34949aae 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -311,7 +311,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt // blacklist idea inspired by [AS]: // respond a 404 for all AGIS ("all you get is 
shit") servers String hostlow = host.toLowerCase(); - if (switchboard.blacklistedURL(hostlow, path)) { + if (switchboard.urlBlacklist.isListed(hostlow, path)) { httpd.sendRespondError(conProp,respond,4,403,null, "URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null); this.theLogger.logInfo("AGIS blocking of host '" + hostlow + "'"); @@ -797,7 +797,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt // check the blacklist, inspired by [AS]: respond a 404 for all AGIS (all you get is shit) servers String hostlow = host.toLowerCase(); - if (switchboard.blacklistedURL(hostlow, path)) { + if (switchboard.urlBlacklist.isListed(hostlow, path)) { try { byte[] errorMsg = ("404 (generated): URL '" + hostlow + "' blocked by yacy proxy (blacklisted)\r\n").getBytes(); httpd.sendRespondHeader(conProp,respond,httpVer,404,"Not Found (AGIS)",0); diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 0660636d0..35703de86 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -120,13 +120,13 @@ public class plasmaCrawlLURL extends plasmaURL { } - public synchronized entry newEntry(URL url, String descr, Date moddate, Date loaddate, + public synchronized Entry addEntry(URL url, String descr, Date moddate, Date loaddate, String initiatorHash, String executorHash, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, long size, int wordCount, int stackType) { - entry e = new entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount); + Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount); if (initiatorHash == null) initiatorHash = dummyHash; if (executorHash == null) executorHash = dummyHash; switch (stackType) { @@ -137,16 +137,37 @@ public class 
plasmaCrawlLURL extends plasmaURL { case 4: proxyResultStack.add(e.urlHash + initiatorHash + executorHash); break; case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; - } return e; } - public synchronized entry newEntry(String propStr, boolean setGlobal, String initiatorHash, String executorHash, int stackType) { + public synchronized Entry addEntry(Entry e, String initiatorHash, String executorHash, int stackType) { + if (e == null) return null; + try { + if (initiatorHash == null) initiatorHash = dummyHash; + if (executorHash == null) executorHash = dummyHash; + switch (stackType) { + case 0: break; + case 1: externResultStack.add(e.urlHash + initiatorHash + executorHash); break; + case 2: searchResultStack.add(e.urlHash + initiatorHash + executorHash); break; + case 3: transfResultStack.add(e.urlHash + initiatorHash + executorHash); break; + case 4: proxyResultStack.add(e.urlHash + initiatorHash + executorHash); break; + case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; + case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; + } + return e; + } catch (Exception ex) { + System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString()); + return null; + } + } + + /* + public synchronized Entry addEntry(String propStr, boolean setGlobal, String initiatorHash, String executorHash, int stackType) { if ((propStr.startsWith("{")) && (propStr.endsWith("}"))) { //System.out.println("DEBUG: propStr=" + propStr); try { - entry e = new entry(s2p(propStr.substring(1, propStr.length() - 1)), setGlobal); + Entry e = new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal); if (initiatorHash == null) initiatorHash = dummyHash; if (executorHash == null) executorHash = dummyHash; switch (stackType) { @@ -157,26 +178,34 @@ public class plasmaCrawlLURL extends plasmaURL { case 4: 
proxyResultStack.add(e.urlHash + initiatorHash + executorHash); break; case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break; - } return e; - } catch (Exception e) { - System.out.println("INTERNAL ERROR in newEntry/2: " + e.toString()); + } catch (Exception ex) { + System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString()); return null; } } else { return null; } } - + */ + public void notifyGCrawl(String urlHash, String initiatorHash, String executorHash) { gcrawlResultStack.add(urlHash + initiatorHash + executorHash); } - public synchronized entry getEntry(String hash) { - return new entry(hash); + public synchronized Entry getEntry(String hash) { + return new Entry(hash); } + public synchronized Entry newEntry(String propStr, boolean setGlobal) { + if ((propStr.startsWith("{")) && (propStr.endsWith("}"))) { + return new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal); + } else { + return null; + } + } + public int getStackSize(int stack) { switch (stack) { case 1: return externResultStack.size(); @@ -282,7 +311,7 @@ public class plasmaCrawlLURL extends plasmaURL { boolean dark = true; String urlHash, initiatorHash, executorHash; - plasmaCrawlLURL.entry urle; + plasmaCrawlLURL.Entry urle; yacySeed initiatorSeed, executorSeed; String cachepath; int c = 0; @@ -318,7 +347,7 @@ public class plasmaCrawlLURL extends plasmaURL { return prop; } - public class entry { + public class Entry { private URL url; private String descr; @@ -335,7 +364,7 @@ public class plasmaCrawlLURL extends plasmaURL { private int wordCount; private String snippet; - public entry(URL url, String descr, Date moddate, Date loaddate, + public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, long size, int wordCount) { // create new entry and store 
it into database @@ -356,7 +385,7 @@ public class plasmaCrawlLURL extends plasmaURL { store(); } - public entry(String urlHash) { + public Entry(String urlHash) { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. @@ -389,7 +418,7 @@ public class plasmaCrawlLURL extends plasmaURL { } } - public entry(Properties prop, boolean setGlobal) { + public Entry(Properties prop, boolean setGlobal) { // generates an plasmaLURLEntry using the properties from the argument // the property names must correspond to the one from toString //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); @@ -591,7 +620,7 @@ public class plasmaCrawlLURL extends plasmaURL { return i.hasNext(); } public Object nextElement() { - return new entry(new String((byte[]) i.next())); + return new Entry(new String((byte[]) i.next())); } } @@ -613,7 +642,7 @@ public class plasmaCrawlLURL extends plasmaURL { plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), 1); Enumeration enu = urls.elements(true, false); while (enu.hasMoreElements()) { - ((entry) enu.nextElement()).print(); + ((Entry) enu.nextElement()).print(); } } catch (Exception e) { e.printStackTrace(); diff --git a/source/de/anomic/plasma/plasmaSearch.java b/source/de/anomic/plasma/plasmaSearch.java index da5529f69..762bd7a2b 100644 --- a/source/de/anomic/plasma/plasmaSearch.java +++ b/source/de/anomic/plasma/plasmaSearch.java @@ -361,9 +361,9 @@ public final class plasmaSearch { return pageAcc.size() > 0; } - public plasmaCrawlLURL.entry nextElement() { + public plasmaCrawlLURL.Entry nextElement() { Object top = pageAcc.lastKey(); - return (plasmaCrawlLURL.entry) pageAcc.remove(top); + return (plasmaCrawlLURL.Entry) pageAcc.remove(top); } protected void addResult(plasmaWordIndexEntry indexEntry) { @@ -373,7 +373,7 @@ public final class plasmaSearch { // 2. 
add reference to reference sorting table // find the url entry - plasmaCrawlLURL.entry page = urlStore.getEntry(indexEntry.getUrlHash()); + plasmaCrawlLURL.Entry page = urlStore.getEntry(indexEntry.getUrlHash()); // take out relevant information for reference computation URL url = page.url(); @@ -402,7 +402,7 @@ public final class plasmaSearch { Object[] resultVector; plasmaWordIndexEntry indexEntry; - plasmaCrawlLURL.entry page; + plasmaCrawlLURL.Entry page; String[] urlcomps; String[] descrcomps; long ranking; @@ -412,7 +412,7 @@ public final class plasmaSearch { // take out values from result array resultVector = (Object[]) results.get(i); indexEntry = (plasmaWordIndexEntry) resultVector[0]; - page = (plasmaCrawlLURL.entry) resultVector[1]; + page = (plasmaCrawlLURL.Entry) resultVector[1]; urlcomps = (String[]) resultVector[2]; descrcomps = (String[]) resultVector[3]; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 075aa3cdb..8989a0b35 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -154,7 +154,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // couloured list management public static TreeSet blueList = null; public static TreeSet stopwords = null; - public static TreeMap blackListURLs = null; + public static plasmaURLPattern urlBlacklist; // storage management private File cachePath; @@ -221,12 +221,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // load the black-list / inspired by [AS] + urlBlacklist = new plasmaURLPattern(new File(getRootPath(), getConfig("listsPath", "DATA/LISTS"))); String f = getConfig("proxyBlackListsActive", null); if (f != null) { - blackListURLs = loadBlacklist("black", f, "/"); - log.logSystem("loaded black-list from file " + f + ", " + blackListURLs.size() + " entries"); - } else { - blackListURLs = new TreeMap(); + 
urlBlacklist.loadLists("black", f, "/"); + log.logSystem("loaded black-list from file " + f + ", " + urlBlacklist.size() + " entries"); } log.logSystem("Proxy Handler Initialized"); @@ -402,41 +401,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } - public TreeMap loadBlacklist(String mapname, String filenames, String sep) { - TreeMap map = new TreeMap(); - File listsPath = new File(getRootPath(), getConfig("listsPath", "DATA/LISTS")); - String filenamesarray[] = filenames.split(","); - - if(filenamesarray.length >0) - for(int i = 0; i < filenamesarray.length; i++) - map.putAll(serverFileUtils.loadMap(mapname, (new File(listsPath, filenamesarray[i])).toString(), sep)); - return map; - } - - public boolean blacklistedURL(String hostlow, String path) { - if (blackListURLs == null) return false; - - String pp = ""; // path-pattern - - // first try to match the domain with wildcard '*' - // [TL] While "." are found within the string - int index = 0; - while ((index = hostlow.indexOf('.', index + 1)) != -1) { - if ((pp = (String) blackListURLs.get(hostlow.substring(0, index + 1) + "*")) != null) { - return ((pp.equals("*")) || (path.substring(1).matches(pp))); - } - } - index = hostlow.length(); - while ((index = hostlow.lastIndexOf('.', index - 1)) != -1) { - if ((pp = (String) blackListURLs.get("*" + hostlow.substring(index, hostlow.length()))) != null) { - return ((pp.equals("*")) || (path.substring(1).matches(pp))); - } - } - - // try to match without wildcard in domain - return (((pp = (String) blackListURLs.get(hostlow)) != null) && - ((pp.equals("*")) || (path.substring(1).matches(pp)))); - } + private static String ppRamString(int bytes) { if (bytes < 1024) return bytes + " KByte"; @@ -1022,7 +987,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser Date lastModified = entry.responseHeader().lastModified(); if (lastModified == null) lastModified = entry.responseHeader().date(); if 
(lastModified == null) lastModified = new Date(); - plasmaCrawlLURL.entry newEntry = urlPool.loadedURL.newEntry( + plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.addEntry( entry.url(), descr, lastModified, new Date(), initiatorHash, yacyCore.seedDB.mySeed.hash, @@ -1176,7 +1141,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (urlhash.equals(plasmaURL.dummyHash)) return null; plasmaCrawlNURL.entry ne = urlPool.noticeURL.getEntry(urlhash); if (ne != null) return ne.url(); - plasmaCrawlLURL.entry le = urlPool.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.Entry le = urlPool.loadedURL.getEntry(urlhash); if (le != null) return le.url(); plasmaCrawlEURL.entry ee = urlPool.errorURL.getEntry(urlhash); if (ee != null) return ee.url(); @@ -1267,8 +1232,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String lurl = (String) page.get("lurl"); if ((lurl != null) && (lurl.length() != 0)) { String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); - plasmaCrawlLURL.entry entry = urlPool.loadedURL.newEntry(propStr, true, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); - urlPool.noticeURL.remove(entry.hash()); + plasmaCrawlLURL.Entry entry = urlPool.loadedURL.addEntry( + urlPool.loadedURL.newEntry(propStr, true), + yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); + urlPool.noticeURL.remove(entry.hash()); log.logInfo("REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + nexturlString + "). 
URL IS CONSIDERED AS 'LOADED!'"); return true; } else { @@ -1329,7 +1296,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public void fetchSnippets(plasmaSearch.result acc, Set queryhashes, String urlmask, int fetchcount) { // fetch the snippets int i = 0; - plasmaCrawlLURL.entry urlentry; + plasmaCrawlLURL.Entry urlentry; String urlstring; plasmaSnippetCache.result snippet; while ((acc.hasMoreElements()) && (i < fetchcount)) { @@ -1398,7 +1365,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser int fetchpeers = ((int) time / 1000) * 3; // number of target peers; means 30 peers in 10 seconds long fetchtime = time * 7 / 10; // time to waste if (fetchcount > count) fetchcount = count; - globalresults = yacySearch.searchHashes(queryhashes, urlPool.loadedURL, searchManager, fetchcount, fetchpeers, snippetCache, fetchtime); + globalresults = yacySearch.searchHashes(queryhashes, urlPool.loadedURL, searchManager, fetchcount, fetchpeers, urlBlacklist, snippetCache, fetchtime); log.logDebug("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); } prop.put("globalresults", globalresults); // the result are written to the local DB @@ -1425,7 +1392,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser int i = 0; int p; URL url; - plasmaCrawlLURL.entry urlentry; + plasmaCrawlLURL.Entry urlentry; String urlstring, urlname, filename; String host, hash, address, descr = ""; yacySeed seed; @@ -1551,7 +1518,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser StringBuffer links = new StringBuffer(); String resource = ""; //plasmaIndexEntry pie; - plasmaCrawlLURL.entry urlentry; + plasmaCrawlLURL.Entry urlentry; plasmaSnippetCache.result snippet; while ((acc.hasMoreElements()) && (i < count)) { urlentry = acc.nextElement(); @@ -1627,7 +1594,7 @@ public final class 
plasmaSwitchboard extends serverAbstractSwitch implements ser // finally, delete the url entry // determine the url string - plasmaCrawlLURL.entry entry = urlPool.loadedURL.getEntry(urlhash); + plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash); URL url = entry.url(); if (url == null) return 0; // get set of words diff --git a/source/de/anomic/plasma/plasmaURL.java b/source/de/anomic/plasma/plasmaURL.java index 30fb50747..7c905c36d 100644 --- a/source/de/anomic/plasma/plasmaURL.java +++ b/source/de/anomic/plasma/plasmaURL.java @@ -146,17 +146,4 @@ public class plasmaURL { return urlHashCache.rows(up, false, urlHash.getBytes()); } - protected static Properties s2p(String s) { - Properties p = new Properties(); - int pos; - StringTokenizer st = new StringTokenizer(s, ","); - String token; - while (st.hasMoreTokens()) { - token = st.nextToken().trim(); - pos = token.indexOf("="); - if (pos > 0) p.setProperty(token.substring(0, pos).trim(), token.substring(pos + 1).trim()); - } - return p; - } - } diff --git a/source/de/anomic/server/serverCodings.java b/source/de/anomic/server/serverCodings.java index fb55f34f7..48e13d092 100644 --- a/source/de/anomic/server/serverCodings.java +++ b/source/de/anomic/server/serverCodings.java @@ -45,6 +45,8 @@ import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.security.MessageDigest; +import java.util.Properties; +import java.util.StringTokenizer; public final class serverCodings { @@ -261,6 +263,19 @@ public final class serverCodings { return null; } + public static Properties s2p(String s) { + Properties p = new Properties(); + int pos; + StringTokenizer st = new StringTokenizer(s, ","); + String token; + while (st.hasMoreTokens()) { + token = st.nextToken().trim(); + pos = token.indexOf("="); + if (pos > 0) p.setProperty(token.substring(0, pos).trim(), token.substring(pos + 1).trim()); + } + return p; + } + public static void main(String[] s) { serverCodings b64 = new 
serverCodings(true); if (s.length == 0) {System.out.println("usage: -[ec|dc|es|ds] "); System.exit(0);} diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index dc8c965cc..17fd52a10 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -57,6 +57,7 @@ import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.plasma.plasmaWordIndexEntryContainer; +import de.anomic.plasma.plasmaURLPattern; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.tools.crypt; @@ -267,7 +268,8 @@ public class yacyClient { public static int search(String wordhashes, int count, boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager, - plasmaSearch searchManager, plasmaSnippetCache snippets, + plasmaSearch searchManager, plasmaURLPattern blacklist, + plasmaSnippetCache snippets, long duetime) { // send a search request to peer with remote Hash // this mainly converts the words into word hashes @@ -335,7 +337,7 @@ public class yacyClient { //System.out.println("yacyClient: search result = " + result.toString()); // debug int results = Integer.parseInt((String) result.get("count")); //System.out.println("***result count " + results); - plasmaCrawlLURL.entry link; + plasmaCrawlLURL.Entry link; // create containers int words = wordhashes.length() / plasmaWordIndexEntry.wordHashLength; @@ -345,9 +347,12 @@ public class yacyClient { } // insert results to containers + plasmaCrawlLURL.Entry lEntry; for (int n = 0; n < results; n++) { // get one single search result - link = urlManager.newEntry((String) result.get("resource" + n), true, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); + lEntry = urlManager.newEntry((String) result.get("resource" + n), true); + if ((lEntry != null) && (blacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))) continue; 
// block with blacklist + link = urlManager.addEntry(lEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); // save the url entry plasmaWordIndexEntry entry = new plasmaWordIndexEntry(link.hash(), link.wordCount(), 0, 0, 0, plasmaSearch.calcVirtualAge(link.moddate()), link.quality(), @@ -482,7 +487,7 @@ public class yacyClient { -er crawlt, Ergebnis erscheint aber unter falschem initiator */ - public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURL.entry entry, String wordhashes) { + public static HashMap crawlReceipt(yacySeed targetSeed, String process, String result, String reason, plasmaCrawlLURL.Entry entry, String wordhashes) { if (targetSeed == null) return null; if (yacyCore.seedDB.mySeed == null) return null; if (yacyCore.seedDB.mySeed == targetSeed) return null; @@ -553,9 +558,9 @@ public class yacyClient { if (uhs.length == 0) return null; // all url's known // extract the urlCache from the result HashMap urlCache = (HashMap) in.get("$URLCACHE$"); - plasmaCrawlLURL.entry[] urls = new plasmaCrawlLURL.entry[uhs.length]; + plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length]; for (int i = 0; i < uhs.length; i++) { - urls[i] = (plasmaCrawlLURL.entry) urlCache.get(uhs[i]); + urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]); if (urls[i] == null) System.out.println("DEBUG transferIndex: error with requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'"); } in = transferURL(targetSeed, urls); @@ -583,7 +588,7 @@ public class yacyClient { Enumeration eenum; plasmaWordIndexEntry entry; HashMap urlCache = new HashMap(); - plasmaCrawlLURL.entry urlentry; + plasmaCrawlLURL.Entry urlentry; HashSet unknownURLs = new HashSet(); for (int i = 0; i < indexes.length; i++) { eenum = indexes[i].elements(true); @@ -646,7 +651,7 @@ public class yacyClient { } } - private static HashMap transferURL(yacySeed targetSeed, plasmaCrawlLURL.entry[] urls) { + private static HashMap 
transferURL(yacySeed targetSeed, plasmaCrawlLURL.Entry[] urls) { // this post a message to the remote message board String address = targetSeed.getAddress(); if (address == null) return null; diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index e4e41f4e9..5d8a07713 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -46,6 +46,7 @@ import java.util.Set; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaURLPattern; import de.anomic.plasma.plasmaSearch; import de.anomic.plasma.plasmaSnippetCache; @@ -56,19 +57,21 @@ public class yacySearch extends Thread { private boolean global; private plasmaCrawlLURL urlManager; private plasmaSearch searchManager; + private plasmaURLPattern blacklist; private plasmaSnippetCache snippetCache; private yacySeed targetPeer; private int links; private long duetime; public yacySearch(Set wordhashes, int count, boolean global, yacySeed targetPeer, - plasmaCrawlLURL urlManager, plasmaSearch searchManager, plasmaSnippetCache snippetCache, long duetime) { + plasmaCrawlLURL urlManager, plasmaSearch searchManager, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, long duetime) { super("yacySearch_" + targetPeer.getName()); this.wordhashes = wordhashes; this.count = count; this.global = global; this.urlManager = urlManager; this.searchManager = searchManager; + this.blacklist = blacklist; this.snippetCache = snippetCache; this.targetPeer = targetPeer; this.links = -1; @@ -76,7 +79,7 @@ public class yacySearch extends Thread { } public void run() { - this.links = yacyClient.search(set2string(wordhashes), count, global, targetPeer, urlManager, searchManager, snippetCache, duetime); + this.links = yacyClient.search(set2string(wordhashes), count, global, targetPeer, urlManager, searchManager, blacklist, snippetCache, duetime); if (links != 0) { //yacyCore.log.logInfo("REMOTE 
SEARCH - remote peer '" + targetPeer.get("Name", "anonymous") + "' contributed " + links + " links for word hash " + wordhashes); yacyCore.seedDB.mySeed.incRI(links); @@ -127,7 +130,7 @@ public class yacySearch extends Thread { } public static int searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaSearch searchManager, - int count, int targets, plasmaSnippetCache snippetCache, long waitingtime) { + int count, int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, long waitingtime) { // check own peer status if ((yacyCore.seedDB.mySeed == null) || (yacyCore.seedDB.mySeed.getAddress() == null)) return 0; @@ -147,7 +150,7 @@ public class yacySearch extends Thread { yacySearch[] searchThreads = new yacySearch[targets]; for (int i = 0; i < targets; i++) { searchThreads[i]= new yacySearch(wordhashes, count, true, targetPeers[i], - urlManager, searchManager, snippetCache, duetime); + urlManager, searchManager, blacklist, snippetCache, duetime); searchThreads[i].start(); try {Thread.currentThread().sleep(20);} catch (InterruptedException e) {} if ((System.currentTimeMillis() - start) > waitingtime) { diff --git a/yacy.logging b/yacy.logging index d6be2159f..08a96ec74 100644 --- a/yacy.logging +++ b/yacy.logging @@ -12,7 +12,7 @@ # INFO regular action information (i.e. any httpd request URL) # FINEST in-function status debug output PARSER.level = INFO -YACY.level = INFO +YACY.level = FINEST HTCACHE.level = INFO PLASMA.level = FINEST SERVER.level = INFO