diff --git a/htroot/BlacklistCleaner_p.java b/htroot/BlacklistCleaner_p.java index d75ad72e8..7e0c24d78 100644 --- a/htroot/BlacklistCleaner_p.java +++ b/htroot/BlacklistCleaner_p.java @@ -5,7 +5,7 @@ // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // -// This File is contributed by Franz Brauße +// This File is contributed by Franz Brausse // // $LastChangedDate: 2007-01-27 14:07:54 +0000 (Sa, 27 Jan 2007) $ // $LastChangedRevision: 3217 $ diff --git a/htroot/ConfigRobotsTxt_p.java b/htroot/ConfigRobotsTxt_p.java index 2bf1073fb..a2051dc03 100644 --- a/htroot/ConfigRobotsTxt_p.java +++ b/htroot/ConfigRobotsTxt_p.java @@ -5,7 +5,7 @@ // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // -// This File is contributed by Franz Brauße +// This File is contributed by Franz Brausse // // $LastChangedDate: $ // $LastChangedRevision: $ diff --git a/htroot/LogStatistics_p.java b/htroot/LogStatistics_p.java index bd680415e..3188f6595 100644 --- a/htroot/LogStatistics_p.java +++ b/htroot/LogStatistics_p.java @@ -6,7 +6,7 @@ // Frankfurt, Germany, 2005 // Created 16.01.2007 // -// This File is contributed by Franz Brauße +// This File is contributed by Franz Brausse // // $LastChangedDate: 2007-01-17 12:00:00 +0100 (Di, 17 Jan 2007) $ // $LastChangedRevision: 3216 $ diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 1e90cdf59..2b7ea961c 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -145,6 +145,7 @@ public final class search { // prepare an abstract result StringBuffer indexabstract = new StringBuffer(); + int indexabstractContainercount = 0; int joincount = 0; plasmaSearchPostOrder acc = null; plasmaSearchQuery squery = null; @@ -170,6 +171,7 @@ public final class search { entry = (Map.Entry) ci.next(); wordhash = (String) entry.getKey(); indexContainer container = (indexContainer) entry.getValue(); + indexabstractContainercount += container.size(); indexabstract.append("indexabstract." 
+ wordhash + "=").append(plasmaURL.compressIndex(container, null, 1000).toString()).append(serverCore.crlfString); } } @@ -215,11 +217,14 @@ public final class search { } d = yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhash); if (d < mindhtdistance) { + // calculate the word hash that is closest to our dht position mindhtdistance = d; neardhthash = wordhash; } indexcount.append("indexcount.").append(container.getWordHash()).append('=').append(Integer.toString(container.size())).append(serverCore.crlfString); if ((abstractSet != null) && (abstractSet.contains(wordhash))) { + // if a specific index-abstract is demanded, attach it here + indexabstractContainercount += container.size(); indexabstract.append("indexabstract." + wordhash + "=").append(plasmaURL.compressIndex(container, null,1000).toString()).append(serverCore.crlfString); } } @@ -242,10 +247,15 @@ public final class search { if ((maxcounthash == null) || (urls.length() != 0) || (queryhashes.size() == 1) || (abstracts.length() == 0)) { prop.putASIS("indexabstract", ""); } else if (abstracts.equals("auto")) { - indexabstract.append("indexabstract." + maxcounthash + "=").append(plasmaURL.compressIndex(((indexContainer) containers[0].get(maxcounthash)),localResults, 1000).toString()).append(serverCore.crlfString); - if ((neardhthash != null) - && (!(neardhthash.equals(maxcounthash)))) { - indexabstract.append("indexabstract." + neardhthash + "=").append(plasmaURL.compressIndex(((indexContainer) containers[0].get(neardhthash)), localResults, 1000).toString()).append(serverCore.crlfString); + // automatically attach the index abstract for the index that has the most references. This should be our target dht position + indexContainer container = (indexContainer) containers[0].get(maxcounthash); + indexabstractContainercount += container.size(); + indexabstract.append("indexabstract." 
+ maxcounthash + "=").append(plasmaURL.compressIndex(container,localResults, 1000).toString()).append(serverCore.crlfString); + if ((neardhthash != null) && (!(neardhthash.equals(maxcounthash)))) { + // in case that the neardhthash is different from the maxcounthash attach also the neardhthash-container + container = (indexContainer) containers[0].get(neardhthash); + indexabstractContainercount += container.size(); + indexabstract.append("indexabstract." + neardhthash + "=").append(plasmaURL.compressIndex(container, localResults, 1000).toString()).append(serverCore.crlfString); } //System.out.println("DEBUG-ABSTRACTGENERATION: maxcounthash = " + maxcounthash); //System.out.println("DEBUG-ABSTRACTGENERATION: neardhthash = "+ neardhthash); @@ -321,7 +331,11 @@ public final class search { prop.putASIS("fwrec", ""); // peers that would have helped to construct this result (recommendations) // log - yacyCore.log.logInfo("EXIT HASH SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(squery.queryHashes) + " - " + joincount + " links found, " + prop.get("linkcount", "?") + " links selected, " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); + yacyCore.log.logInfo("EXIT HASH SEARCH: " + + plasmaSearchQuery.anonymizedQueryHashes(squery.queryHashes) + " - " + joincount + " links found, " + + prop.get("linkcount", "?") + " links selected, " + + indexabstractContainercount + " index abstract references attached, " + + (System.currentTimeMillis() - timestamp) + " milliseconds"); prop.putASIS("searchtime", Long.toString(System.currentTimeMillis() - timestamp)); diff --git a/source/de/anomic/data/diff.java b/source/de/anomic/data/diff.java index 0767b1136..0729caded 100644 --- a/source/de/anomic/data/diff.java +++ b/source/de/anomic/data/diff.java @@ -6,7 +6,7 @@ // Frankfurt, Germany, 2007 // Created 03.02.2007 // -// This file is contributed by Franz Brauße +// This file is contributed by Franz Brausse // // $LastChangedDate: $ // $LastChangedRevision: $ diff 
--git a/source/de/anomic/data/wiki/knwikiParser.java b/source/de/anomic/data/wiki/knwikiParser.java index 1b0ac8791..f7eeb9e02 100644 --- a/source/de/anomic/data/wiki/knwikiParser.java +++ b/source/de/anomic/data/wiki/knwikiParser.java @@ -6,7 +6,7 @@ // Frankfurt, Germany, 2007 // Created 22.02.2007 // -// This file is contributed by Franz Brauße +// This file is contributed by Franz Brausse // // $LastChangedDate: $ // $LastChangedRevision: $ diff --git a/source/de/anomic/data/wiki/tokens/AbstractToken.java b/source/de/anomic/data/wiki/tokens/AbstractToken.java index 231afecb3..dc3985d0e 100644 --- a/source/de/anomic/data/wiki/tokens/AbstractToken.java +++ b/source/de/anomic/data/wiki/tokens/AbstractToken.java @@ -6,7 +6,7 @@ // Frankfurt, Germany, 2007 // Created 22.02.2007 // -// This file is contributed by Franz Brauße +// This file is contributed by Franz Brausse // // $LastChangedDate: $ // $LastChangedRevision: $ diff --git a/source/de/anomic/data/wiki/tokens/DefinitionListToken.java b/source/de/anomic/data/wiki/tokens/DefinitionListToken.java index 726cc3d63..effb77e3c 100644 --- a/source/de/anomic/data/wiki/tokens/DefinitionListToken.java +++ b/source/de/anomic/data/wiki/tokens/DefinitionListToken.java @@ -6,7 +6,7 @@ // Frankfurt, Germany, 2007 // Created 22.02.2007 // -// This file is contributed by Franz Brauße +// This file is contributed by Franz Brausse // // $LastChangedDate: $ // $LastChangedRevision: $ diff --git a/source/de/anomic/data/wiki/tokens/LinkToken.java b/source/de/anomic/data/wiki/tokens/LinkToken.java index 3e27b1bff..cb067e6e4 100644 --- a/source/de/anomic/data/wiki/tokens/LinkToken.java +++ b/source/de/anomic/data/wiki/tokens/LinkToken.java @@ -6,7 +6,7 @@ // Frankfurt, Germany, 2007 // Created 22.02.2007 // -// This file is contributed by Franz Brauße +// This file is contributed by Franz Brausse // // $LastChangedDate: $ // $LastChangedRevision: $ diff --git a/source/de/anomic/data/wiki/tokens/ListToken.java 
b/source/de/anomic/data/wiki/tokens/ListToken.java index 16b7d7113..50b6845aa 100644 --- a/source/de/anomic/data/wiki/tokens/ListToken.java +++ b/source/de/anomic/data/wiki/tokens/ListToken.java @@ -6,7 +6,7 @@ // Frankfurt, Germany, 2007 // Created 22.02.2007 // -// This file is contributed by Franz Brauße +// This file is contributed by Franz Brausse // // $LastChangedDate: $ // $LastChangedRevision: $ diff --git a/source/de/anomic/data/wiki/tokens/SimpleToken.java b/source/de/anomic/data/wiki/tokens/SimpleToken.java index ac701c429..06e2270d0 100644 --- a/source/de/anomic/data/wiki/tokens/SimpleToken.java +++ b/source/de/anomic/data/wiki/tokens/SimpleToken.java @@ -6,7 +6,7 @@ // Frankfurt, Germany, 2007 // Created 22.02.2007 // -// This file is contributed by Franz Brauße +// This file is contributed by Franz Brausse // // $LastChangedDate: $ // $LastChangedRevision: $ diff --git a/source/de/anomic/data/wiki/tokens/TableToken.java b/source/de/anomic/data/wiki/tokens/TableToken.java index 2fd8e13b1..fc9e41d62 100644 --- a/source/de/anomic/data/wiki/tokens/TableToken.java +++ b/source/de/anomic/data/wiki/tokens/TableToken.java @@ -6,7 +6,7 @@ // Frankfurt, Germany, 2007 // Created 22.02.2007 // -// This file is contributed by Franz Brauße +// This file is contributed by Franz Brausse // // $LastChangedDate: $ // $LastChangedRevision: $ diff --git a/source/de/anomic/data/wiki/tokens/Token.java b/source/de/anomic/data/wiki/tokens/Token.java index 0d5675e9c..335345b4c 100644 --- a/source/de/anomic/data/wiki/tokens/Token.java +++ b/source/de/anomic/data/wiki/tokens/Token.java @@ -6,7 +6,7 @@ // Frankfurt, Germany, 2007 // Created 22.02.2007 // -// This file is contributed by Franz Brauße +// This file is contributed by Franz Brausse // // $LastChangedDate: $ // $LastChangedRevision: $ diff --git a/source/de/anomic/http/httpdRobotsTxtConfig.java b/source/de/anomic/http/httpdRobotsTxtConfig.java index 2086b2697..dac694c9b 100644 --- 
a/source/de/anomic/http/httpdRobotsTxtConfig.java +++ b/source/de/anomic/http/httpdRobotsTxtConfig.java @@ -6,7 +6,7 @@ // Frankfurt, Germany, 2007 // Created 22.02.2007 // -// This file is contributed by Franz Brauße +// This file is contributed by Franz Brausse // // $LastChangedDate: $ // $LastChangedRevision: $ diff --git a/source/de/anomic/plasma/parser/sevenzip/ByteArrayIInStream.java b/source/de/anomic/plasma/parser/sevenzip/ByteArrayIInStream.java index e29489e1f..9e53da8d8 100644 --- a/source/de/anomic/plasma/parser/sevenzip/ByteArrayIInStream.java +++ b/source/de/anomic/plasma/parser/sevenzip/ByteArrayIInStream.java @@ -5,7 +5,7 @@ // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // -// This file ist contributed by Franz Brausze +// This file ist contributed by Franz Brausse // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by diff --git a/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java b/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java index 64ace4f37..0a04e0a6e 100644 --- a/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java +++ b/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java @@ -5,7 +5,7 @@ // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // -// This file ist contributed by Franz Brausze +// This file ist contributed by Franz Brausse // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by diff --git a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java index 9032a9603..c6aa6fa7e 100644 --- a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java +++ b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java @@ -5,7 +5,7 @@ // first published on http://www.anomic.de // 
Frankfurt, Germany, 2004 // -// This file ist contributed by Franz Brausze +// This file ist contributed by Franz Brausse // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 1723bc941..6f05a34a7 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -178,7 +178,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public static int indexingSlots = 30; public static int stackCrawlSlots = 1000000; - private int dhtTransferIndexCount = 50; + private int dhtTransferIndexCount = 100; // we must distinguish the following cases: resource-load was initiated by // 1) global crawling: the index is extern, not here (not possible here) diff --git a/source/de/anomic/xml/crawlHandler.java b/source/de/anomic/xml/crawlHandler.java new file mode 100644 index 000000000..fd6aec6cb --- /dev/null +++ b/source/de/anomic/xml/crawlHandler.java @@ -0,0 +1,281 @@ +// crawlHandler.java +// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 24.07.2007 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
/**
 * SAX handler that reads an RSS-like XML document and collects the values of
 * a fixed set of tags (see startpointTags) for the enclosing "channel" element
 * and for every "item" element ("startpoint") found in it.
 *
 * Parsing happens in the constructor. Parse errors are printed to stderr and
 * otherwise ignored, so after a failed parse the handler simply holds whatever
 * was collected up to the point of failure (possibly nothing).
 */
public class crawlHandler extends DefaultHandler {

    // counter used to generate default guids for startpoints;
    // shared by all handler instances
    private static int guidcount = 0;

    // the tag names whose text content is stored; all other tags are ignored
    private static final String[] startpointTags = new String[]{
        "author",
        "copyright",
        "category",
        "title",
        "link",
        "language",
        "description",
        "creator",
        "pubDate",
        "guid",
        "docs"
    };

    // the same tag names as a set, for fast membership tests in endElement
    private static final HashSet startpointTagsSet = new HashSet();
    static {
        for (int i = 0; i < startpointTags.length; i++) {
            startpointTagsSet.add(startpointTags[i]);
        }
    }

    // parser state
    private Startpoint channel, startpoint;  // the channel, and the item currently being read
    private StringBuffer buffer;             // collects character data between tags
    private boolean parsingAttributes, parsingStartpoint; // inside <channel> / inside <item>
    private ArrayList startpointsGUID;       // guids in document order, so items can be retrieved by index
    private HashMap startpoints;             // guid -> Startpoint

    /**
     * Creates a handler and parses the document at the given location.
     *
     * @param path passed unchanged to SAXParser.parse(String, DefaultHandler)
     */
    public crawlHandler(String path) {
        init();
        parse(path);
    }

    /**
     * Creates a handler and parses the document read from the given stream.
     *
     * @param stream the XML input; it is not closed by this class
     */
    public crawlHandler(InputStream stream) {
        init();
        parse(stream);
    }

    // resets all parser state to "nothing parsed yet"
    private void init() {
        startpointsGUID = new ArrayList();
        startpoints = new HashMap();
        buffer = new StringBuffer();
        startpoint = null;
        channel = null;
        parsingAttributes = false;
        parsingStartpoint = false;
    }

    // parses the document at 'path'; failures are reported on stderr only (best effort)
    private void parse(String path) {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            SAXParser saxParser = factory.newSAXParser();
            saxParser.parse(path, this);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // parses the document from 'stream'; failures are reported on stderr only (best effort)
    private void parse(InputStream stream) {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            SAXParser saxParser = factory.newSAXParser();
            saxParser.parse(stream, this);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * SAX callback for an opening tag. Opens a new channel or a new item.
     *
     * The last parameter type is written fully qualified on purpose: inside
     * this class the simple name "Attributes" denotes the nested class
     * crawlHandler.Attributes. With the simple name this method would only be
     * an overload that the SAX parser never calls, instead of an override of
     * DefaultHandler.startElement.
     *
     * @param uri  namespace URI as delivered by the parser (unused)
     * @param name local name as delivered by the parser (unused)
     * @param tag  qualified tag name; this is what is compared
     * @param atts XML attributes of the element (unused)
     */
    public void startElement(String uri, String name, String tag, org.xml.sax.Attributes atts) throws SAXException {
        if ("channel".equals(tag)) {
            channel = new Startpoint();
            parsingAttributes = true;
        } else if ("item".equals(tag)) {
            startpoint = new Startpoint();
            parsingStartpoint = true;
        }
    }

    /**
     * SAX callback for a closing tag. Closes the channel or the current item,
     * or stores the collected text of a known tag in the current item
     * (preferred while inside an item) or in the channel.
     *
     * @param uri  namespace URI as delivered by the parser (unused)
     * @param name local name as delivered by the parser (unused)
     * @param tag  qualified tag name; this is what is compared
     */
    public void endElement(String uri, String name, String tag) {
        if (tag == null) return;
        if ("channel".equals(tag)) {
            parsingAttributes = false;
        } else if ("item".equals(tag)) {
            // register the finished item under its guid; a <guid> element inside
            // the item has replaced the generated default guid by now
            String guid = startpoint.getGuid();
            startpointsGUID.add(guid);
            startpoints.put(guid, startpoint);
            parsingStartpoint = false;
        } else if (parsingStartpoint) {
            // the item check comes first because items are nested inside the channel
            String value = buffer.toString().trim();
            buffer.setLength(0);
            if (startpointTagsSet.contains(tag)) startpoint.setValue(tag, value);
        } else if (parsingAttributes) {
            String value = buffer.toString().trim();
            buffer.setLength(0);
            if (startpointTagsSet.contains(tag)) channel.setValue(tag, value);
        }
    }

    /**
     * SAX callback for character data; collected only while inside a channel or item.
     */
    public void characters(char ch[], int start, int length) {
        if (parsingStartpoint || parsingAttributes) {
            buffer.append(ch, start, length);
        }
    }

    /**
     * @return the channel values, or null if no "channel" element was read
     */
    public Startpoint getChannel() {
        return channel;
    }

    /**
     * Retrieves an item by its order number in the document.
     *
     * @param i zero-based position of the item
     * @return the item registered under the guid at position i
     */
    public Startpoint getStartpoint(int i) {
        return getStartpoint((String) startpointsGUID.get(i));
    }

    /**
     * Retrieves an item by its guid.
     *
     * @param guid the guid of the item
     * @return the item, or null if no item with this guid was read
     */
    public Startpoint getStartpoint(String guid) {
        return (Startpoint) startpoints.get(guid);
    }

    /**
     * @return the number of distinct guids for which an item is stored
     */
    public int startpoints() {
        return startpoints.size();
    }

    /**
     * Plain name/value container with typed getters for the known tag names.
     * Not used by the handler itself.
     */
    public static class Attributes {

        private HashMap map;

        public Attributes() {
            this.map = new HashMap();
        }

        public void setValue(String name, String value) {
            map.put(name, value);
        }

        public String getAuthor() {
            return (String) map.get("author");
        }

        public String getCopyright() {
            return (String) map.get("copyright");
        }

        public String getCategory() {
            return (String) map.get("category");
        }

        public String getTitle() {
            return (String) map.get("title");
        }

        public String getLink() {
            return (String) map.get("link");
        }

        public String getLanguage() {
            return (String) map.get("language");
        }

        public String getDescription() {
            return (String) map.get("description");
        }

        public String getCreator() {
            return (String) map.get("creator");
        }

        public String getPubDate() {
            return (String) map.get("pubDate");
        }

        public String getGuid() {
            return (String) map.get("guid");
        }

        public String getDocs() {
            return (String) map.get("docs");
        }
    }

    /**
     * Name/value container for a channel or an item. Every instance starts
     * with a generated guid (hex time stamp, a colon, and a running counter),
     * which a later setValue("guid", ...) replaces.
     */
    public static class Startpoint {

        private HashMap map;

        public Startpoint() {
            this.map = new HashMap();
            this.map.put("guid", Long.toHexString(System.currentTimeMillis()) + ":" + guidcount++);
        }

        public void setValue(String name, String value) {
            map.put(name, value);
        }

        public String getAuthor() {
            return (String) map.get("author");
        }

        public String getCopyright() {
            return (String) map.get("copyright");
        }

        public String getCategory() {
            return (String) map.get("category");
        }

        public String getTitle() {
            return (String) map.get("title");
        }

        public String getLink() {
            return (String) map.get("link");
        }

        public String getLanguage() {
            return (String) map.get("language");
        }

        public String getDescription() {
            return (String) map.get("description");
        }

        public String getCreator() {
            return (String) map.get("creator");
        }

        public String getPubDate() {
            return (String) map.get("pubDate");
        }

        public String getGuid() {
            return (String) map.get("guid");
        }

        public String getDocs() {
            return (String) map.get("docs");
        }
    }
}