- replaced unicode characters in copyright holder name ('Brausse')

- more logging for bootstrap seedlist loading
- larger DHT chunks

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4015 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 31023dbc7a
commit e76fe1c078

@ -5,7 +5,7 @@
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This File is contributed by Franz Brauße
// This File is contributed by Franz Brausse
//
// $LastChangedDate: 2007-01-27 14:07:54 +0000 (Sa, 27 Jan 2007) $
// $LastChangedRevision: 3217 $

@ -5,7 +5,7 @@
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This File is contributed by Franz Brauße
// This File is contributed by Franz Brausse
//
// $LastChangedDate: $
// $LastChangedRevision: $

@ -6,7 +6,7 @@
// Frankfurt, Germany, 2005
// Created 16.01.2007
//
// This File is contributed by Franz Brauße
// This File is contributed by Franz Brausse
//
// $LastChangedDate: 2007-01-17 12:00:00 +0100 (Di, 17 Jan 2007) $
// $LastChangedRevision: 3216 $

@ -145,6 +145,7 @@ public final class search {
// prepare an abstract result
StringBuffer indexabstract = new StringBuffer();
int indexabstractContainercount = 0;
int joincount = 0;
plasmaSearchPostOrder acc = null;
plasmaSearchQuery squery = null;
@ -170,6 +171,7 @@ public final class search {
entry = (Map.Entry) ci.next();
wordhash = (String) entry.getKey();
indexContainer container = (indexContainer) entry.getValue();
indexabstractContainercount += container.size();
indexabstract.append("indexabstract." + wordhash + "=").append(plasmaURL.compressIndex(container, null, 1000).toString()).append(serverCore.crlfString);
}
}
@ -215,11 +217,14 @@ public final class search {
}
d = yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhash);
if (d < mindhtdistance) {
// calculate the word hash that is closest to our dht position
mindhtdistance = d;
neardhthash = wordhash;
}
indexcount.append("indexcount.").append(container.getWordHash()).append('=').append(Integer.toString(container.size())).append(serverCore.crlfString);
if ((abstractSet != null) && (abstractSet.contains(wordhash))) {
// if a specific index-abstract is demanded, attach it here
indexabstractContainercount += container.size();
indexabstract.append("indexabstract." + wordhash + "=").append(plasmaURL.compressIndex(container, null,1000).toString()).append(serverCore.crlfString);
}
}
@ -242,10 +247,15 @@ public final class search {
if ((maxcounthash == null) || (urls.length() != 0) || (queryhashes.size() == 1) || (abstracts.length() == 0)) {
prop.putASIS("indexabstract", "");
} else if (abstracts.equals("auto")) {
indexabstract.append("indexabstract." + maxcounthash + "=").append(plasmaURL.compressIndex(((indexContainer) containers[0].get(maxcounthash)),localResults, 1000).toString()).append(serverCore.crlfString);
if ((neardhthash != null)
&& (!(neardhthash.equals(maxcounthash)))) {
indexabstract.append("indexabstract." + neardhthash + "=").append(plasmaURL.compressIndex(((indexContainer) containers[0].get(neardhthash)), localResults, 1000).toString()).append(serverCore.crlfString);
// automatically attach the index abstract for the index that has the most references. This should be our target dht position
indexContainer container = (indexContainer) containers[0].get(maxcounthash);
indexabstractContainercount += container.size();
indexabstract.append("indexabstract." + maxcounthash + "=").append(plasmaURL.compressIndex(container,localResults, 1000).toString()).append(serverCore.crlfString);
if ((neardhthash != null) && (!(neardhthash.equals(maxcounthash)))) {
// in case that the neardhthash is different from the maxcounthash attach also the neardhthash-container
container = (indexContainer) containers[0].get(neardhthash);
indexabstractContainercount += container.size();
indexabstract.append("indexabstract." + neardhthash + "=").append(plasmaURL.compressIndex(container, localResults, 1000).toString()).append(serverCore.crlfString);
}
//System.out.println("DEBUG-ABSTRACTGENERATION: maxcounthash = " + maxcounthash);
//System.out.println("DEBUG-ABSTRACTGENERATION: neardhthash = "+ neardhthash);
@ -321,7 +331,11 @@ public final class search {
prop.putASIS("fwrec", ""); // peers that would have helped to construct this result (recommendations)
// log
yacyCore.log.logInfo("EXIT HASH SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(squery.queryHashes) + " - " + joincount + " links found, " + prop.get("linkcount", "?") + " links selected, " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
yacyCore.log.logInfo("EXIT HASH SEARCH: " +
plasmaSearchQuery.anonymizedQueryHashes(squery.queryHashes) + " - " + joincount + " links found, " +
prop.get("linkcount", "?") + " links selected, " +
indexabstractContainercount + " index abstract references attached, " +
(System.currentTimeMillis() - timestamp) + " milliseconds");
prop.putASIS("searchtime", Long.toString(System.currentTimeMillis() - timestamp));

@ -6,7 +6,7 @@
// Frankfurt, Germany, 2007
// Created 03.02.2007
//
// This file is contributed by Franz Brauße
// This file is contributed by Franz Brausse
//
// $LastChangedDate: $
// $LastChangedRevision: $

@ -6,7 +6,7 @@
// Frankfurt, Germany, 2007
// Created 22.02.2007
//
// This file is contributed by Franz Brauße
// This file is contributed by Franz Brausse
//
// $LastChangedDate: $
// $LastChangedRevision: $

@ -6,7 +6,7 @@
// Frankfurt, Germany, 2007
// Created 22.02.2007
//
// This file is contributed by Franz Brauße
// This file is contributed by Franz Brausse
//
// $LastChangedDate: $
// $LastChangedRevision: $

@ -6,7 +6,7 @@
// Frankfurt, Germany, 2007
// Created 22.02.2007
//
// This file is contributed by Franz Brauße
// This file is contributed by Franz Brausse
//
// $LastChangedDate: $
// $LastChangedRevision: $

@ -6,7 +6,7 @@
// Frankfurt, Germany, 2007
// Created 22.02.2007
//
// This file is contributed by Franz Brauße
// This file is contributed by Franz Brausse
//
// $LastChangedDate: $
// $LastChangedRevision: $

@ -6,7 +6,7 @@
// Frankfurt, Germany, 2007
// Created 22.02.2007
//
// This file is contributed by Franz Brauße
// This file is contributed by Franz Brausse
//
// $LastChangedDate: $
// $LastChangedRevision: $

@ -6,7 +6,7 @@
// Frankfurt, Germany, 2007
// Created 22.02.2007
//
// This file is contributed by Franz Brauße
// This file is contributed by Franz Brausse
//
// $LastChangedDate: $
// $LastChangedRevision: $

@ -6,7 +6,7 @@
// Frankfurt, Germany, 2007
// Created 22.02.2007
//
// This file is contributed by Franz Brauße
// This file is contributed by Franz Brausse
//
// $LastChangedDate: $
// $LastChangedRevision: $

@ -6,7 +6,7 @@
// Frankfurt, Germany, 2007
// Created 22.02.2007
//
// This file is contributed by Franz Brauße
// This file is contributed by Franz Brausse
//
// $LastChangedDate: $
// $LastChangedRevision: $

@ -6,7 +6,7 @@
// Frankfurt, Germany, 2007
// Created 22.02.2007
//
// This file is contributed by Franz Brauße
// This file is contributed by Franz Brausse
//
// $LastChangedDate: $
// $LastChangedRevision: $

@ -5,7 +5,7 @@
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file ist contributed by Franz Brausze
// This file ist contributed by Franz Brausse
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by

@ -5,7 +5,7 @@
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file ist contributed by Franz Brausze
// This file ist contributed by Franz Brausse
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by

@ -5,7 +5,7 @@
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file ist contributed by Franz Brausze
// This file ist contributed by Franz Brausse
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by

@ -178,7 +178,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public static int indexingSlots = 30;
public static int stackCrawlSlots = 1000000;
private int dhtTransferIndexCount = 50;
private int dhtTransferIndexCount = 100;
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)

@ -0,0 +1,281 @@
// crawlHandler.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 24.07.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.xml;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that collects the text of a fixed set of child elements from a
 * document built out of a <code>channel</code> element and <code>item</code>
 * elements (an RSS-style layout).
 *
 * <p>Values found directly inside <code>channel</code> are stored in one
 * {@link Startpoint} returned by {@link #getChannel()}. Every
 * <code>item</code> becomes its own {@link Startpoint}, retrievable either by
 * document order ({@link #getStartpoint(int)}) or by guid
 * ({@link #getStartpoint(String)}).</p>
 *
 * <p>Parsing happens in the constructor. Parse failures are printed to stderr
 * and otherwise ignored, so a handler built from a broken document simply
 * contains whatever was collected before the failure.</p>
 *
 * <p>NOTE(review): the parser is created with default factory settings, so
 * DTDs and external entities are processed as the JDK default allows. If the
 * input can come from untrusted peers, consider restricting these features.</p>
 */
public class crawlHandler extends DefaultHandler {

    // statics for item generation and automatic categorization

    /** Counter appended to generated guids so that two items created in the same millisecond differ. */
    private static int guidcount = 0;

    /** Names of the child elements whose text content is stored. */
    private static final String[] startpointTags = new String[]{
        "author",
        "copyright",
        "category",
        "title",
        "link",
        "language",
        "description",
        "creator",
        "pubDate",
        "guid",
        "docs"
    };

    /** Lookup set over {@link #startpointTags}. */
    private static final HashSet startpointTagsSet = new HashSet();
    static {
        for (int i = 0; i < startpointTags.length; i++) {
            startpointTagsSet.add(startpointTags[i]);
        }
    }

    // class variables
    private Startpoint channel, startpoint;
    private StringBuffer buffer;
    private boolean parsingAttributes, parsingStartpoint;
    private ArrayList startpointsGUID; // a list of GUIDs, so the items can be retrieved by a specific order
    private HashMap startpoints; // a guid:Item map

    /**
     * Parses the document identified by the given URI.
     *
     * @param path URI of the document, passed unchanged to {@link SAXParser#parse(String, DefaultHandler)}
     */
    public crawlHandler(String path) {
        init();
        parse(path);
    }

    /**
     * Parses the document read from the given stream.
     *
     * @param stream source of the XML document
     */
    public crawlHandler(InputStream stream) {
        init();
        parse(stream);
    }

    /** Resets all collected state; called once before parsing starts. */
    private void init() {
        startpointsGUID = new ArrayList();
        startpoints = new HashMap();
        buffer = new StringBuffer();
        startpoint = null;
        channel = null;
        parsingAttributes = false;
        parsingStartpoint = false;
    }

    /** Creates a SAX parser from the default factory configuration. */
    private static SAXParser newParser() throws javax.xml.parsers.ParserConfigurationException, SAXException {
        return SAXParserFactory.newInstance().newSAXParser();
    }

    private void parse(String path) {
        try {
            newParser().parse(path, this);
        } catch (Exception e) {
            // best effort: keep whatever was collected before the failure
            e.printStackTrace();
        }
    }

    private void parse(InputStream stream) {
        try {
            newParser().parse(stream, this);
        } catch (Exception e) {
            // best effort: keep whatever was collected before the failure
            e.printStackTrace();
        }
    }

    /**
     * Opens a new channel or item record.
     *
     * <p>The attribute parameter is spelled with its fully qualified name on
     * purpose: the nested {@link crawlHandler.Attributes} class shadows the SAX
     * interface of the same simple name, and with the simple name this method
     * would only be an overload that the parser never calls.</p>
     *
     * @param uri  namespace URI (unused)
     * @param name local name (unused)
     * @param tag  qualified element name
     * @param atts element attributes (unused)
     */
    public void startElement(String uri, String name, String tag, org.xml.sax.Attributes atts) throws SAXException {
        // drop text (usually whitespace) that preceded this element
        buffer.setLength(0);
        if ("channel".equals(tag)) {
            channel = new Startpoint();
            parsingAttributes = true;
        } else if ("item".equals(tag)) {
            startpoint = new Startpoint();
            parsingStartpoint = true;
        }
    }

    /**
     * Closes a channel or item record, or stores the text collected for a
     * known child element in the record that is currently open. While an item
     * is open its children go to the item, otherwise to the channel.
     *
     * @param uri  namespace URI (unused)
     * @param name local name (unused)
     * @param tag  qualified element name
     */
    public void endElement(String uri, String name, String tag) {
        if (tag == null) return;
        if ("channel".equals(tag)) {
            parsingAttributes = false;
        } else if ("item".equals(tag)) {
            // register the finished item under its (parsed or generated) guid
            String guid = startpoint.getGuid();
            startpointsGUID.add(guid);
            startpoints.put(guid, startpoint);
            parsingStartpoint = false;
        } else if (parsingStartpoint) {
            String value = buffer.toString().trim();
            buffer.setLength(0);
            if (startpointTagsSet.contains(tag)) startpoint.setValue(tag, value);
        } else if (parsingAttributes) {
            String value = buffer.toString().trim();
            buffer.setLength(0);
            if (startpointTagsSet.contains(tag)) channel.setValue(tag, value);
        }
    }

    /** Accumulates character data while a channel or item is open. */
    public void characters(char ch[], int start, int length) {
        if (parsingStartpoint || parsingAttributes) {
            buffer.append(ch, start, length);
        }
    }

    /**
     * @return the record holding the channel-level values, or <code>null</code>
     *         if no <code>channel</code> element was seen
     */
    public Startpoint getChannel() {
        return channel;
    }

    /**
     * Retrieves an item by its order number.
     *
     * @param i zero-based position of the item in document order
     * @return the item at that position
     */
    public Startpoint getStartpoint(int i) {
        return getStartpoint((String) startpointsGUID.get(i));
    }

    /**
     * Retrieves an item by its guid.
     *
     * @param guid guid of the item
     * @return the item, or <code>null</code> if no item is stored under that guid
     */
    public Startpoint getStartpoint(String guid) {
        return (Startpoint) startpoints.get(guid);
    }

    /** @return the number of stored items */
    public int startpoints() {
        return startpoints.size();
    }

    /**
     * Plain name/value holder with typed getters for the known element names.
     * It is not used by the handler itself. Note that its simple name hides
     * <code>org.xml.sax.Attributes</code> inside this class.
     */
    public static class Attributes {

        private HashMap map;

        public Attributes() {
            this.map = new HashMap();
        }

        public void setValue(String name, String value) {
            map.put(name, value);
        }

        public String getAuthor() {
            return (String) map.get("author");
        }

        public String getCopyright() {
            return (String) map.get("copyright");
        }

        public String getCategory() {
            return (String) map.get("category");
        }

        public String getTitle() {
            return (String) map.get("title");
        }

        public String getLink() {
            return (String) map.get("link");
        }

        public String getLanguage() {
            return (String) map.get("language");
        }

        public String getDescription() {
            return (String) map.get("description");
        }

        public String getCreator() {
            return (String) map.get("creator");
        }

        public String getPubDate() {
            return (String) map.get("pubDate");
        }

        public String getGuid() {
            return (String) map.get("guid");
        }

        public String getDocs() {
            return (String) map.get("docs");
        }
    }

    /**
     * One record of collected element values. A fresh record starts with a
     * generated guid (hex timestamp plus a running counter); a parsed
     * <code>guid</code> element replaces it.
     */
    public static class Startpoint {

        private HashMap map;

        public Startpoint() {
            this.map = new HashMap();
            this.map.put("guid", Long.toHexString(System.currentTimeMillis()) + ":" + guidcount++);
        }

        public void setValue(String name, String value) {
            map.put(name, value);
        }

        public String getAuthor() {
            return (String) map.get("author");
        }

        public String getCopyright() {
            return (String) map.get("copyright");
        }

        public String getCategory() {
            return (String) map.get("category");
        }

        public String getTitle() {
            return (String) map.get("title");
        }

        public String getLink() {
            return (String) map.get("link");
        }

        public String getLanguage() {
            return (String) map.get("language");
        }

        public String getDescription() {
            return (String) map.get("description");
        }

        public String getCreator() {
            return (String) map.get("creator");
        }

        public String getPubDate() {
            return (String) map.get("pubDate");
        }

        public String getGuid() {
            return (String) map.get("guid");
        }

        public String getDocs() {
            return (String) map.get("docs");
        }
    }
}

@ -170,9 +170,17 @@ public class yacyPeerActions {
reqHeader.put(httpHeader.CACHE_CONTROL,"no-cache");
url = new URL(seedListFileURL);
long start = System.currentTimeMillis();
header = httpc.whead(url, url.getHost(), this.bootstrapLoadTimeout, null, null, this.sb.remoteProxyConfig,reqHeader);
if ((header == null) || (header.lastModified() == null)) {
yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available");
long loadtime = System.currentTimeMillis() - start;
if (header == null) {
if (loadtime > this.bootstrapLoadTimeout) {
yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available, time-out after " + loadtime + " milliseconds");
} else {
yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available, no content");
}
} else if (header.lastModified() == null) {
yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not usable, last-modified is missing");
} else if ((header.age() > 86400000) && (ssc > 0)) {
yacyCore.log.logInfo("BOOTSTRAP: seed-list URL " + seedListFileURL + " too old (" + (header.age() / 86400000) + " days)");
} else {

@ -756,9 +756,9 @@ indexDistribution.maxOpenFiles = 800
indexTransfer.maxOpenFiles = 800
# sizes for index distribution
indexDistribution.minChunkSize = 5
indexDistribution.minChunkSize = 10
indexDistribution.maxChunkSize = 1000
indexDistribution.startChunkSize = 50
indexDistribution.startChunkSize = 200
indexDistribution.maxChunkFails = 1

Loading…
Cancel
Save