diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index e2be76047..717a8dca7 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -307,7 +307,7 @@ public class IndexCreate_p { prop.put("indexing-queue_list_"+i+"_modified", daydate(pcentry.lastModified)); prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getAnchors().size()))); prop.put("indexing-queue_list_"+i+"_anchor", ((pcentry.scraper == null) ? "-" : pcentry.scraper.getHeadline()) ); - prop.put("indexing-queue_list_"+i+"_url", pcentry.urlString); + prop.put("indexing-queue_list_"+i+"_url", pcentry.nomalizedURLString); dark = !dark; } } diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index a1126738a..df43ea799 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -456,7 +456,7 @@ public class dir { public static void deletePhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr) { try { String urlhash = plasmaURL.urlHash(new URL(urlstring)); - Set words = plasmaSwitchboard.getWords(("yacyshare " + phrase + " " + descr).getBytes()); + Set words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes()); switchboard.removeReferences(urlhash, words); switchboard.loadedURL.remove(urlhash); } catch (Exception e) { diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 70069b9ed..b210a1e04 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -122,7 +122,10 @@ public class transferRWI { Iterator it = unknownURL.iterator(); while (it.hasNext()) unknownURLs += "," + (String) it.next(); if (unknownURLs.length() > 0) unknownURLs = unknownURLs.substring(1); - switchboard.log.logInfo("Received " + received + " Words [" + wordhashes[0] + " .. " + wordhashes[wordhashes.length - 1] + "] from peer " + iam + ", requested " + unknownURL.size() + " URL's"); + if (wordhashes.length == 0) + switchboard.log.logInfo("Received 0 Words from peer " + iam + ", requested " + unknownURL.size() + " URL's"); + else + switchboard.log.logInfo("Received " + received + " Words [" + wordhashes[0] + " .. " + wordhashes[wordhashes.length - 1] + "] from peer " + iam + ", requested " + unknownURL.size() + " URL's"); result = "ok"; } else { result = "error_not_granted"; diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 075f60e8a..382f7bd76 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -583,6 +583,17 @@ public class plasmaCondenser { } } + public static Set getWords(byte[] text) { + if (text == null) return null; + ByteArrayInputStream buffer = new ByteArrayInputStream(text); + try { + plasmaCondenser condenser = new plasmaCondenser(buffer); + return condenser.getWords(); + } catch (IOException e) { + return null; + } + } + public static void main(String[] args) { if ((args.length == 0) || (args.length > 3)) System.out.println("wrong number of arguments: plasmaCondenser -text|-html "); else try { diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index e358acdaf..f8d20b2cd 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -238,7 +238,7 @@ public class plasmaHTCache { if ((entry.status == CACHE_FILL) || (entry.status == CACHE_STALE_RELOAD_GOOD) || (entry.status == CACHE_STALE_RELOAD_BAD)) { - responseHeaderDB.set(entry.urlHash, entry.responseHeader); + responseHeaderDB.set(entry.nomalizedURLHash, entry.responseHeader); } // work off unwritten files and undone parsing @@ -254,7 +254,7 @@ public class plasmaHTCache { } entry.cacheFile.getParentFile().mkdirs(); serverFileUtils.write(entry.cacheArray, entry.cacheFile); - entry.cacheArray = null; + //entry.cacheArray = null; } catch (FileNotFoundException e) { // this is the case of a "(Not a directory)" error, which should be prohibited // by the shallStoreCache() property. However, sometimes the error still occurs @@ -444,8 +444,8 @@ public class plasmaHTCache { public File cacheFile; // the cache file public byte[] cacheArray; // or the cache as byte-array public URL url; - public String urlHash; - public String urlString; + public String nomalizedURLHash; + public String nomalizedURLString; public int status; // cache load/hit/stale etc status public Date lastModified; public char doctype; @@ -462,15 +462,15 @@ public class plasmaHTCache { plasmaCrawlProfile.entry profile) { // normalize url - this.urlString = htmlFilterContentScraper.urlNormalform(url); + this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url); try { - this.url = new URL(urlString); + this.url = new URL(nomalizedURLString); } catch (MalformedURLException e) { System.out.println("internal error at httpdProxyCache.Entry: " + e); System.exit(-1); } this.cacheFile = getCachePath(this.url); - this.urlHash = plasmaCrawlLURL.urlHash(urlString); + this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString); // assigned: this.initDate = initDate; @@ -496,7 +496,7 @@ public class plasmaHTCache { lastModified = responseHeader.lastModified(); if (lastModified == null) lastModified = new Date(); // does not exist in header } - this.doctype = plasmaWordIndexEntry.docType(urlString); + this.doctype = plasmaWordIndexEntry.docType(nomalizedURLString); this.language = plasmaWordIndexEntry.language(url); // to be defined later: @@ -554,8 +554,8 @@ public class plasmaHTCache { // -CGI access in request // CGI access makes the page very individual, and therefore not usable in caches - if ((isPOST(urlString)) && (!(profile.crawlingQ()))) return "dynamic_post"; - if (isCGI(urlString)) return "dynamic_cgi"; + if ((isPOST(nomalizedURLString)) && (!(profile.crawlingQ()))) return "dynamic_post"; + if (isCGI(nomalizedURLString)) return "dynamic_cgi"; // -authorization cases in request // authorization makes pages very individual, and therefore we cannot use the @@ -622,8 +622,8 @@ public class plasmaHTCache { // -CGI access in request // CGI access makes the page very individual, and therefore not usable in caches - if (isPOST(urlString)) return false; - if (isCGI(urlString)) return false; + if (isPOST(nomalizedURLString)) return false; + if (isCGI(nomalizedURLString)) return false; // -authorization cases in request if (requestHeader.containsKey("AUTHORIZATION")) return false; @@ -747,8 +747,8 @@ public class plasmaHTCache { // -CGI access in request // CGI access makes the page very individual, and therefore not usable in caches - if ((isPOST(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)"; - if ((isCGI(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)"; + if ((isPOST(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)"; + if ((isCGI(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)"; // -authorization cases in request // we checked that in shallStoreCache @@ -759,7 +759,7 @@ public class plasmaHTCache { // a picture cannot be indexed if (isPicture(responseHeader)) return "Media_Content_(Picture)"; if (!(isText(responseHeader))) return "Media_Content_(not_text)"; - if (noIndexingURL(urlString)) return "Media_Content_(forbidden)"; + if (noIndexingURL(nomalizedURLString)) return "Media_Content_(forbidden)"; // -if-modified-since in request @@ -864,8 +864,8 @@ public class plasmaHTCache { // -CGI access in request // CGI access makes the page very individual, and therefore not usable in caches - if ((isPOST(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)"; - if ((isCGI(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)"; + if ((isPOST(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)"; + if ((isCGI(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)"; // -authorization cases in request // we checked that in shallStoreCache @@ -876,7 +876,7 @@ public class plasmaHTCache { // a picture cannot be indexed if (isPicture(responseHeader)) return "Media_Content_(Picture)"; if (!(isText(responseHeader))) return "Media_Content_(not_text)"; - if (noIndexingURL(urlString)) return "Media_Content_(forbidden)"; + if (noIndexingURL(nomalizedURLString)) return "Media_Content_(forbidden)"; // -if-modified-since in request // if the page is fresh at the very moment we can index it diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 934ad3bec..cc221dc4c 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -393,7 +393,6 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi public synchronized void deQueue() { if (serverJobs < 5) { if (processStack.size() > 0) { - log.logDebug("DEQUEUE: dequeueing one step (processStack=" + processStack.size() + ", localStackSize=" + noticeURL.localStackSize() + ", remoteStackSize=" + noticeURL.remoteStackSize() + ")"); processResourceStack((plasmaHTCache.Entry) processStack.removeFirst()); } } else { @@ -469,7 +468,13 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi private synchronized void processResourceStack(plasmaHTCache.Entry entry) { // work off one stack entry with a fresh resource (scraped web page) - if ((entry.cacheArray != null) || (entry.scraper != null)) try { + String stats = "DEQUEUE: dequeueing one step (processStack=" + processStack.size() + ", localStackSize=" + noticeURL.localStackSize() + ", remoteStackSize=" + noticeURL.remoteStackSize() + ")"; + if ((entry.cacheArray == null) && (entry.scraper == null)) { + log.logDebug(stats + " entry for " + entry.nomalizedURLString + " has no content -- skipped"); + return; + } + try { + // we must distinguish the following cases: resource-load was initiated by // 1) global crawling: the index is extern, not here (not possible here) // 2) result of search queries, some indexes are here (not possible here) @@ -492,15 +497,15 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi processCase = 6; } - log.logDebug("processResourceStack: processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", url=" + entry.url); // DEBUG + log.logDebug(stats + " processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", source=" + ((entry.cacheArray == null) ? "scraper" : "byte[]") + ", url=" + entry.nomalizedURLString); // DEBUG // parse content plasmaParser.document document; if (entry.scraper != null) { - log.logDebug("(Parser) '" + entry.urlString + "' is pre-parsed by scraper"); + log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is pre-parsed by scraper"); document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper); } else { - log.logDebug("(Parser) '" + entry.urlString + "' is not parsed, parsing now"); + log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed, parsing now"); document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheArray); } @@ -516,11 +521,11 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi while (i.hasNext()) { e = (Map.Entry) i.next(); nexturlstring = (String) e.getKey(); - rejectReason = stackCrawl(nexturlstring, entry.urlString, initiatorHash, (String) e.getValue(), entry.lastModified, entry.depth + 1, entry.profile); + rejectReason = stackCrawl(nexturlstring, entry.nomalizedURLString, initiatorHash, (String) e.getValue(), entry.lastModified, entry.depth + 1, entry.profile); if (rejectReason == null) { c++; } else { - errorURL.newEntry(new URL(nexturlstring), entry.urlString, entry.initiator(), yacyCore.seedDB.mySeed.hash, + errorURL.newEntry(new URL(nexturlstring), entry.nomalizedURLString, entry.initiator(), yacyCore.seedDB.mySeed.hash, (String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false); } } @@ -543,12 +548,12 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi } if (noIndexReason == null) { // strip out words - log.logDebug("(Profile) Condensing for '" + entry.urlString + "'"); + log.logDebug("(Profile) Condensing for '" + entry.nomalizedURLString + "'"); plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText())); //log.logInfo("INDEXING HEADLINE:" + descr); try { - log.logDebug("(Profile) Create LURL-Entry for '" + entry.urlString + "'"); + log.logDebug("(Profile) Create LURL-Entry for '" + entry.nomalizedURLString + "'"); plasmaCrawlLURL.entry newEntry = loadedURL.newEntry( entry.url, descr, entry.lastModified, new Date(), initiatorHash, @@ -563,28 +568,28 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi ); String urlHash = newEntry.hash(); - log.logDebug("(Profile) Remove NURL for '" + entry.urlString + "'"); + log.logDebug("(Profile) Remove NURL for '" + entry.nomalizedURLString + "'"); noticeURL.remove(urlHash); // worked-off if (((processCase == 4) || (processCase == 5) || (processCase == 6)) && (entry.profile.localIndexing())) { // remove stopwords - log.logDebug("(Profile) Exclude Stopwords for '" + entry.urlString + "'"); + log.logDebug("(Profile) Exclude Stopwords for '" + entry.nomalizedURLString + "'"); log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url); //System.out.println("DEBUG: words left to be indexed: " + condenser.getWords()); // do indexing - log.logDebug("(Profile) Create Index for '" + entry.urlString + "'"); + log.logDebug("(Profile) Create Index for '" + entry.nomalizedURLString + "'"); int words = searchManager.addPageIndex(entry.url, urlHash, entry.lastModified, condenser, entry.language, entry.doctype); log.logInfo("Indexed " + words + " words in URL " + entry.url + " (" + descr + ")"); // if this was performed for a remote crawl request, notify requester if ((processCase == 6) && (initiator != null)) { - log.logInfo("Sending crawl receipt for '" + entry.urlString + "' to " + initiator.getName()); + log.logInfo("Sending crawl receipt for '" + entry.nomalizedURLString + "' to " + initiator.getName()); yacyClient.crawlReceipt(initiator, "crawl", "fill", "indexed", newEntry, ""); } } else { - log.logDebug("Resource '" + entry.urlString + "' not indexed (indexing is off)"); + log.logDebug("Resource '" + entry.nomalizedURLString + "' not indexed (indexing is off)"); } } catch (Exception ee) { log.logError("Could not index URL " + entry.url + ": " + ee.getMessage()); @@ -831,11 +836,12 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi } + private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy"); public static String dateString(Date date) { if (date == null) return ""; else return DateFormatter.format(date); } - + public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) { @@ -911,7 +917,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi filename = url.getFile(); if ((seed == null) || ((address = seed.getAddress()) == null)) { // seed is not known from here - removeReferences(urlentry.hash(), getWords(("yacyshare " + filename.replace('?', ' ') + " " + urlentry.descr()).getBytes())); + removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + urlentry.descr()).getBytes())); loadedURL.remove(urlentry.hash()); // clean up continue; // next result } @@ -1062,28 +1068,6 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi // actually it is used there for testing purpose return "PROPS: " + super.toString() + "; QUEUE: " + processStack.toString(); } - - /* - private void addScoreForked(kelondroMScoreCluster ref, String no, String[] words) { - String s; - if (words != null) for (int i = 0; i < words.length; i++) { - s = words[i].trim().toLowerCase(); - if (s.indexOf(".") >= 0) addScoreForked(ref, no, s.split("\\.")); - else if (s.indexOf(",") >= 0) addScoreForked(ref, no, s.split(",")); - else if (s.indexOf(":") >= 0) addScoreForked(ref, no, s.split(":")); - else if (s.indexOf("-") >= 0) addScoreForked(ref, no, s.split("-")); - else if (s.indexOf("/") >= 0) addScoreForked(ref, no, s.split("/")); - else if (s.indexOf('"') >= 0) addScoreForked(ref, no, s.split(new String(new byte[] {(char)'"'}))); - else addScoreFiltered(ref, no, s); - } - } - private void addScoreFiltered(kelondroMScoreCluster ref, String no, String word) { - if ((word.length() > 2) && - ("http_html_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) && - (no.indexOf(word) < 0)) - ref.incScore(word); - } - */ // method for index deletion public int removeAllUrlReferences(URL url, boolean fetchOnline) { @@ -1099,7 +1083,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi URL url = entry.url(); if (url == null) return 0; // get set of words - Set words = getWords(getText(getResource(url, fetchOnline))); + Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); // delete all word references int count = removeReferences(urlhash, words); // finally delete the url entry itself @@ -1172,17 +1156,6 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi } } - public static Set getWords(byte[] text) { - if (text == null) return null; - ByteArrayInputStream buffer = new ByteArrayInputStream(text); - try { - plasmaCondenser condenser = new plasmaCondenser(buffer); - return condenser.getWords(); - } catch (IOException e) { - return null; - } - } - public class distributeIndex { // distributes parts of the index to other peers // stops as soon as an error occurrs @@ -1214,6 +1187,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi ((transferred = performTransferIndex(indexCount, peerCount, true)) > 0)) { indexCount = transferred; if ((System.currentTimeMillis() - starttime) > (maxTime * peerCount)) indexCount--; else indexCount++; + if (indexCount < 30) indexCount = 30; return true; } else { // make a long pause @@ -1230,6 +1204,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi public void setCounts(int indexCount, int peerCount, long pause) { this.indexCount = indexCount; + if (indexCount < 30) indexCount = 30; this.peerCount = peerCount; this.pause = pause; } diff --git a/source/de/anomic/yacy/yacyDHTAction.java b/source/de/anomic/yacy/yacyDHTAction.java index a8ea0bbf8..0308c4b68 100644 --- a/source/de/anomic/yacy/yacyDHTAction.java +++ b/source/de/anomic/yacy/yacyDHTAction.java @@ -63,7 +63,6 @@ public class yacyDHTAction implements yacyPeerAction { } } - public Enumeration getDHTSeeds(boolean up, String firstHash) { // enumerates seed-type objects: all seeds with starting point in the middle, rotating at the end/beginning return new seedDHTEnum(up, firstHash); @@ -104,10 +103,8 @@ public class yacyDHTAction implements yacyPeerAction { return e2.nextElement(); } } - } - public Enumeration getAcceptRemoteIndexSeeds(String starthash) { // returns an enumeration of yacySeed-Objects // that have the AcceptRemoteIndex-Flag set @@ -131,10 +128,16 @@ public class yacyDHTAction implements yacyPeerAction { private yacySeed nextInternal() { yacySeed s; - while (se.hasMoreElements()) { - s = (yacySeed) se.nextElement(); - if (s == null) return null; - if (s.getFlagAcceptRemoteIndex()) return s; + try { + while (se.hasMoreElements()) { + s = (yacySeed) se.nextElement(); + if (s == null) return null; + if (s.getFlagAcceptRemoteIndex()) return s; + } + } catch (kelondroException e) { + yacyCore.log.logError("database inconsistency (" + e.getMessage() + "), re-set of db."); + seedDB.resetActiveTable(); + return null; } return null; } @@ -147,7 +150,6 @@ public class yacyDHTAction implements yacyPeerAction { } - public Enumeration getAcceptRemoteCrawlSeeds(String starthash, boolean available) { return new acceptRemoteCrawlSeedEnum(starthash, available); } diff --git a/source/de/anomic/yacy/yacySeedDB.java b/source/de/anomic/yacy/yacySeedDB.java index b6fc27ba5..5a0f28c6b 100644 --- a/source/de/anomic/yacy/yacySeedDB.java +++ b/source/de/anomic/yacy/yacySeedDB.java @@ -142,7 +142,7 @@ public class yacySeedDB { private synchronized kelondroMap resetSeedTable(kelondroMap seedDB, File seedDBFile) { // this is an emergency function that should only be used if any problem with the // seed.db is detected - yacyCore.log.logError("seed-db " + seedDBFile.toString() + " reset (on-the-fly)"); + yacyCore.log.logDebug("seed-db " + seedDBFile.toString() + " reset (on-the-fly)"); try { seedDB.close(); seedDBFile.delete(); @@ -154,6 +154,10 @@ public class yacySeedDB { return seedDB; } + public synchronized void resetActiveTable() { seedActiveDB = resetSeedTable(seedActiveDB, seedActiveDBFile); } + public synchronized void resetPassiveTable() { seedPassiveDB = resetSeedTable(seedPassiveDB, seedPassiveDBFile); } + public synchronized void resetPotentialTable() { seedPotentialDB = resetSeedTable(seedPotentialDB, seedPotentialDBFile); } + public void close() { try { seedActiveDB.close(); diff --git a/yacy.blue b/yacy.blue index 054c67159..e69de29bb 100644 --- a/yacy.blue +++ b/yacy.blue @@ -1 +0,0 @@ -testblue \ No newline at end of file