Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git

pull/1/head
orbiter 13 years ago
commit e76159040b

@ -466,7 +466,13 @@ public class Crawler_p {
// get links and generate filter
final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());
if (newcrawlingdepth > 0) {
if (fullDomain) {
newcrawlingMustMatch = siteFilter(hyperlinks.keySet());
} else if (subPath) {
newcrawlingMustMatch = subpathFilter(hyperlinks.keySet());
}
}
final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
final CrawlProfile profile = new CrawlProfile(
@ -681,4 +687,16 @@ public class Crawler_p {
}
return filter.length() > 0 ? filter.substring(1) : "";
}
/**
 * Builds a regular expression that accepts every URL beginning with the
 * normal form of one of the given URLs.
 *
 * <p>Each URL contributes one alternative of the form {@code <quoted prefix>.*};
 * the alternatives are joined with {@code '|'}. The prefix is passed through
 * {@link java.util.regex.Pattern#quote(String)} because a URL string can contain
 * characters such as {@code '.'}, {@code '?'}, {@code '+'} or {@code '('} that
 * would otherwise be interpreted as regex operators, so that a URL with a query
 * string would not even match its own filter.
 *
 * @param uris the URLs whose normal forms are used as literal prefixes
 * @return the alternation of all distinct prefix patterns, or the empty string
 *         if {@code uris} is empty
 */
private static String subpathFilter(final Set<MultiProtocolURI> uris) {
    // collect into a set first so identical prefixes are emitted only once
    final Set<String> filterSet = new HashSet<String>();
    for (final MultiProtocolURI uri : uris) {
        // fully qualified name: avoids depending on an import this block cannot see
        filterSet.add(java.util.regex.Pattern.quote(uri.toNormalform(true, false)) + ".*");
    }

    final StringBuilder filter = new StringBuilder();
    for (final String element : filterSet) {
        filter.append('|').append(element);
    }
    // drop the leading '|' that the loop put in front of the first alternative
    return filter.length() > 0 ? filter.substring(1) : "";
}
}

@ -443,7 +443,6 @@ public final class CrawlStacker {
if (oldEntry == null) {
if (dbocc != null) {
// do double-check
if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is double registered in '" + dbocc + "'.");
if (dbocc.equals("errors")) {
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")";

@ -167,7 +167,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
final Entry entry = new Entry(bentry, executor, workdate, workcount, reason);
put(entry);
this.stack.add(entry.hash());
Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + reason);
if (!reason.startsWith("double")) Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + reason);
if (this.solrConnector != null && failCategory.store) {
// send the error to solr
try {

@ -31,8 +31,8 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.lang.reflect.InvocationTargetException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.LogManager;
@ -329,7 +329,7 @@ public final class Log {
}
protected final static logEntry poison = new logEntry();
protected final static BlockingQueue<logEntry> logQueue = new LinkedBlockingQueue<logEntry>();
protected final static BlockingQueue<logEntry> logQueue = new ArrayBlockingQueue<logEntry>(300);
private final static logRunner logRunnerThread = new logRunner();
static {

@ -292,7 +292,7 @@ public final class Switchboard extends serverSwitch
sb = this;
// set loglevel and log
setLog(new Log("YACY_SEARCH"));
setLog(new Log("SWITCHBOARD"));
// set default peer name
Seed.ANON_PREFIX = getConfig("peernameprefix", "_anon");
@ -1661,10 +1661,6 @@ public final class Switchboard extends serverSwitch
}
// put document into the concurrent processing queue
if ( this.log.isFinest() ) {
this.log.logFinest("deQueue: passing to indexing queue: "
+ response.url().toNormalform(true, false));
}
try {
this.indexingDocumentProcessor.enQueue(new indexingQueueEntry(
response,
@ -2249,11 +2245,6 @@ public final class Switchboard extends serverSwitch
public indexingQueueEntry parseDocument(final indexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_PARSING);
// debug
if ( this.log.isFinest() ) {
this.log.logFinest("PARSE " + in.queueEntry);
}
Document[] documents = null;
try {
documents = parseDocument(in.queueEntry);
@ -2487,9 +2478,6 @@ public final class Switchboard extends serverSwitch
}
in.documents = doclist.toArray(new Document[doclist.size()]);
final Condenser[] condenser = new Condenser[in.documents.length];
if ( this.log.isFine() ) {
this.log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'");
}
for ( int i = 0; i < in.documents.length; i++ ) {
condenser[i] =
new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry

Loading…
Cancel
Save