- replaced load failure logging by information which is stored in Solr
- fixed a bug with crawling of feeds: added must-match pattern application to feed urls to filter out urls that do not belong to the wanted domain
- delegatedURLs, which also used ZURLs, are now temporary objects in memory
parent 31920385f7
commit 2602be8d1e
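The feed-crawler fix from the commit message is not visible in the diffs below. In essence, the crawl profile's must-match pattern is now also applied to urls taken from feed items, so urls outside the wanted domain are dropped before they reach the crawl queue. A minimal sketch of that filtering step (FeedUrlFilter and filterFeedUrls are illustrative names, not part of this commit):

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

public class FeedUrlFilter {

    /**
     * Keep only those feed item urls that match the crawl profile's
     * must-match pattern; urls outside the wanted domain are dropped
     * before they are enqueued.
     */
    public static List<String> filterFeedUrls(final List<String> feedUrls, final Pattern mustMatch) {
        final List<String> accepted = new ArrayList<String>();
        for (final String url : feedUrls) {
            if (mustMatch.matcher(url).matches()) accepted.add(url);
        }
        return accepted;
    }
}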
net/yacy/cora/federate/solr/FailCategory.java (new file)
@@ -0,0 +1,39 @@
/**
 *  FailCategory
 *  Copyright 2013 by Michael Peter Christen
 *  First released 17.10.2013 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.cora.federate.solr;

public enum FailCategory {
    // TEMPORARY categories are failure cases that should be tried again
    // FINAL categories are failure cases that are final and should not be tried again
    TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not be loaded
    FINAL_PROCESS_CONTEXT(false, FailType.excl),    // because of a processing context we do not want that url again (i.e. remote crawling)
    FINAL_LOAD_CONTEXT(false, FailType.excl),       // the crawler configuration does not want to load the entity
    FINAL_ROBOTS_RULE(true, FailType.excl),         // a remote server denies indexing or loading
    FINAL_REDIRECT_RULE(true, FailType.excl);       // the remote server redirects this page, thus disallowing reading of content

    public final boolean store;
    public final FailType failType;

    private FailCategory(final boolean store, final FailType failType) {
        this.store = store;
        this.failType = failType;
    }
}
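Each category carries two flags that callers consult: store decides whether an error document is written to Solr, and failType distinguishes load failures (fail) from deliberate exclusions (excl). The TEMPORARY_/FINAL_ distinction itself is encoded only in the constant names. A small demo of reading these flags (FailCategoryDemo and isRetryable are illustrative, not part of the commit):

import net.yacy.cora.federate.solr.FailCategory;

public class FailCategoryDemo {

    // illustrative helper: TEMPORARY_* categories may be retried,
    // FINAL_* categories must not be crawled again
    static boolean isRetryable(final FailCategory category) {
        return category.name().startsWith("TEMPORARY_");
    }

    public static void main(String[] args) {
        for (final FailCategory c : FailCategory.values()) {
            // c.store decides whether an error document is written to Solr;
            // c.failType (fail vs. excl) is recorded with the error document
            System.out.println(c.name() + " retryable=" + isRetryable(c)
                    + " store=" + c.store + " failType=" + c.failType);
        }
    }
}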
net/yacy/crawler/data/ZURL.java (deleted)
@@ -1,365 +0,0 @@
// plasmaCrawlZURL.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.03.2007 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package net.yacy.crawler.data;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.solr.common.SolrInputDocument;

import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.table.Table;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.index.Fulltext;

public class ZURL implements Iterable<ZURL.Entry> {

    private static final ConcurrentLog log = new ConcurrentLog("REJECTED");

    private static final int EcoFSBufferSize = 2000;
    private static final int maxStackSize = 1000;

    public enum FailCategory {
        // TEMPORARY categories are failure cases that should be tried again
        // FINAL categories are failure cases that are final and should not be tried again
        TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not be loaded
        FINAL_PROCESS_CONTEXT(false, FailType.excl),    // because of a processing context we do not want that url again (i.e. remote crawling)
        FINAL_LOAD_CONTEXT(false, FailType.excl),       // the crawler configuration does not want to load the entity
        FINAL_ROBOTS_RULE(true, FailType.excl),         // a remote server denies indexing or loading
        FINAL_REDIRECT_RULE(true, FailType.excl);       // the remote server redirects this page, thus disallowing reading of content

        public final boolean store;
        public final FailType failType;

        private FailCategory(final boolean store, final FailType failType) {
            this.store = store;
            this.failType = failType;
        }
    }

    private final static Row rowdef = new Row(
            "String urlhash-" + Word.commonHashLength + ", " +  // the url's hash
            "String executor-" + Word.commonHashLength + ", " + // the crawling executor
            "Cardinal workdate-8 {b256}, " +                    // the time when loading the url was last attempted
            "Cardinal workcount-4 {b256}, " +                   // number of load retries
            "String anycause-132, " +                           // string describing the load failure
            "byte[] entry-" + Request.rowdef.objectsize,        // extra space
            Base64Order.enhancedCoder
    );

    // the class object
    private Index urlIndex;
    private final Queue<byte[]> stack;
    private final Fulltext fulltext;

    protected ZURL(
            final Fulltext fulltext,
            final File cachePath,
            final String tablename,
            final boolean startWithEmptyFile,
            final boolean useTailCache,
            final boolean exceed134217727) {
        this.fulltext = fulltext;
        // creates a new ZURL in a file
        cachePath.mkdirs();
        final File f = new File(cachePath, tablename);
        if (startWithEmptyFile) {
            if (f.exists()) {
                if (f.isDirectory()) SplitTable.delete(cachePath, tablename); else FileUtils.deletedelete(f);
            }
        }
        try {
            this.urlIndex = new Table(f, rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727, true);
        } catch (final SpaceExceededException e) {
            try {
                this.urlIndex = new Table(f, rowdef, 0, 0, false, exceed134217727, true);
            } catch (final SpaceExceededException e1) {
                ConcurrentLog.logException(e1);
            }
        }
        //urlIndex = new kelondroFlexTable(cachePath, tablename, -1, rowdef, 0, true);
        this.stack = new LinkedBlockingQueue<byte[]>();
    }

    protected void clear() throws IOException {
        if (this.urlIndex != null) this.urlIndex.clear();
        if (this.stack != null) this.stack.clear();
    }

    protected void close() {
        try {clear();} catch (final IOException e) {}
        if (this.urlIndex != null) this.urlIndex.close();
    }

    public boolean remove(final byte[] hash) {
        if (hash == null) return false;
        //System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " remove " + hash);
        try {
            final Iterator<byte[]> i = ZURL.this.stack.iterator();
            while (i.hasNext()) {
                final byte[] b = i.next();
                if (NaturalOrder.naturalOrder.equal(hash, b)) i.remove();
            }
            return this.urlIndex.delete(hash);
        } catch (final IOException e) {
            return false;
        }
    }

    public void removeHosts(final Iterable<byte[]> hosthashes, final boolean concurrent) {
        if (hosthashes == null) return;
        final Thread t = new Thread() {
            @Override
            public void run() {
                try {
                    Iterator<byte[]> i = ZURL.this.urlIndex.keys(true, null);
                    final List<byte[]> r = new ArrayList<byte[]>();
                    while (i.hasNext()) {
                        final byte[] b = i.next();
                        for (final byte[] hosthash: hosthashes) {
                            // the host hash is embedded at byte offset 6 of the url hash
                            if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b);
                        }
                    }
                    for (final byte[] b: r) ZURL.this.urlIndex.remove(b);
                    i = ZURL.this.stack.iterator();
                    while (i.hasNext()) {
                        final byte[] b = i.next();
                        for (final byte[] hosthash: hosthashes) {
                            if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove();
                        }
                    }
                } catch (final IOException e) {}
            }
        };
        if (concurrent) t.start(); else t.run();
    }

    public void push(
            final Request bentry,
            final CrawlProfile profile,
            final byte[] executor,
            final Date workdate,
            final int workcount,
            final FailCategory failCategory,
            String anycause,
            final int httpcode) {
        // assert executor != null; // null == proxy !
        assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
        if (exists(bentry.url().hash())) return; // don't insert double causes
        if (anycause == null) anycause = "unknown";
        final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
        final Entry entry = new Entry(bentry, executor, workdate, workcount, reason);
        put(entry);
        this.stack.add(entry.hash());
        if (!reason.startsWith("double")) log.info(bentry.url().toNormalform(true) + " - " + reason);
        if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
            // send the error to solr
            try {
                final SolrInputDocument errorDoc = this.fulltext.getDefaultConfiguration().err(bentry.url(), profile == null ? null : profile.collections(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
                this.fulltext.getDefaultConnector().add(errorDoc);
            } catch (final IOException e) {
                ConcurrentLog.warn("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage());
            }
        }
        while (this.stack.size() > maxStackSize) this.stack.poll();
    }

    @Override
    public Iterator<ZURL.Entry> iterator() {
        return new EntryIterator();
    }

    public ArrayList<ZURL.Entry> list(int max) {
        final ArrayList<ZURL.Entry> l = new ArrayList<ZURL.Entry>();
        DigestURL url;
        for (final ZURL.Entry entry: this) {
            if (entry == null) continue;
            url = entry.url();
            if (url == null) continue;
            l.add(entry);
            if (max-- <= 0) l.remove(0); // keep at most max entries; drop the oldest
        }
        return l;
    }

    private class EntryIterator implements Iterator<ZURL.Entry> {
        private final Iterator<byte[]> hi;

        public EntryIterator() {
            this.hi = ZURL.this.stack.iterator();
        }

        @Override
        public boolean hasNext() {
            return this.hi.hasNext();
        }

        @Override
        public ZURL.Entry next() {
            return get(this.hi.next());
        }

        @Override
        public void remove() {
            this.hi.remove();
        }
    }

    public ZURL.Entry get(final byte[] urlhash) {
        try {
            if (this.urlIndex == null) return null;
            //System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash);
            final Row.Entry entry = this.urlIndex.get(urlhash, false);
            if (entry == null) return null;
            return new Entry(entry);
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
            return null;
        }
    }

    /**
     * private put (use push instead)
     * @param entry
     */
    private void put(final Entry entry) {
        // stores the values from the object variables into the database
        if (entry.stored) return;
        if (entry.bentry == null) return;
        final Row.Entry newrow = rowdef.newEntry();
        newrow.setCol(0, entry.bentry.url().hash());
        newrow.setCol(1, entry.executor);
        newrow.setCol(2, entry.workdate.getTime());
        newrow.setCol(3, entry.workcount);
        newrow.setCol(4, UTF8.getBytes(entry.anycause));
        newrow.setCol(5, entry.bentry.toRow().bytes());
        try {
            if (this.urlIndex != null) this.urlIndex.put(newrow);
            entry.stored = true;
        } catch (final Exception e) {
            ConcurrentLog.logException(e);
        }
    }

    boolean exists(final byte[] urlHash) {
        return this.urlIndex.has(urlHash);
    }

    public void clearStack() {
        this.stack.clear();
    }

    public int stackSize() {
        return this.stack.size();
    }

    public class Entry {

        private Request bentry;          // the balancer entry
        private final byte[] executor;   // the crawling executor
        private final Date workdate;     // the time when loading the url was last attempted
        private final int workcount;     // number of retries
        private final String anycause;   // string describing the reason for the load failure
        private boolean stored;

        private Entry(
                final Request bentry,
                final byte[] executor,
                final Date workdate,
                final int workcount,
                final String anycause) {
            // create new entry
            assert bentry != null;
            // assert executor != null; // null == proxy !
            this.bentry = bentry;
            this.executor = executor;
            this.workdate = (workdate == null) ? new Date() : workdate;
            this.workcount = workcount;
            this.anycause = (anycause == null) ? "" : anycause;
            this.stored = false;
        }

        private Entry(final Row.Entry entry) throws IOException {
            assert (entry != null);
            this.executor = entry.getColBytes(1, true);
            this.workdate = new Date(entry.getColLong(2));
            this.workcount = (int) entry.getColLong(3);
            this.anycause = entry.getColUTF8(4);
            this.bentry = new Request(Request.rowdef.newEntry(entry.getColBytes(5, false)));
            assert (Base64Order.enhancedCoder.equal(entry.getPrimaryKeyBytes(), this.bentry.url().hash()));
            this.stored = true;
        }

        public DigestURL url() {
            return this.bentry.url();
        }

        public byte[] initiator() {
            return this.bentry.initiator();
        }

        private byte[] hash() {
            // return a url-hash: a String of 12 bytes within a 72-bit space
            // (each byte has a 6-bit range), which should be enough for all web pages
            return this.bentry.url().hash();
        }

        public Date workdate() {
            return this.workdate;
        }

        public byte[] executor() {
            // return the creator's hash
            return this.executor;
        }

        public String anycause() {
            return this.anycause;
        }
    }
}
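Both the removed ZURL.removeHosts and the new ErrorCache.removeHosts exploit the same property of YaCy url hashes: the 6-byte host hash sits at offset 6 of the 12-byte url hash, which is what the equal(hosthash, 0, b, 6, 6) calls compare. A standalone illustration of that byte-range match (HostHashMatch is a hypothetical class and the hash values are made up):

import java.nio.charset.StandardCharsets;

public class HostHashMatch {

    /**
     * Compare a 6-byte host hash against bytes 6..11 of a 12-byte url hash,
     * mirroring NaturalOrder.naturalOrder.equal(hosthash, 0, urlhash, 6, 6).
     */
    public static boolean sameHost(final byte[] hosthash, final byte[] urlhash) {
        if (hosthash.length < 6 || urlhash.length < 12) return false;
        for (int i = 0; i < 6; i++) {
            if (hosthash[i] != urlhash[6 + i]) return false;
        }
        return true;
    }

    public static void main(String[] args) {
        // made-up base64-like hashes, for illustration only
        final byte[] urlhash = "AbCdEfhost01".getBytes(StandardCharsets.US_ASCII);
        final byte[] hosthash = "host01".getBytes(StandardCharsets.US_ASCII);
        System.out.println(sameHost(hosthash, urlhash)); // true
    }
}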
net/yacy/search/index/ErrorCache.java (new file)
@@ -0,0 +1,173 @@
/**
 *  ErrorCache
 *  Copyright 2013 by Michael Peter Christen
 *  First released 17.10.2013 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.search.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;

public class ErrorCache {

    private static final ConcurrentLog log = new ConcurrentLog("REJECTED");
    private static final int maxStackSize = 1000;

    // the class object
    private final LinkedHashMap<String, CollectionConfiguration.FailDoc> stack;
    private final Fulltext fulltext;

    public ErrorCache(final Fulltext fulltext) {
        this.fulltext = fulltext;
        this.stack = new LinkedHashMap<String, CollectionConfiguration.FailDoc>();
        try {
            // fill the stack with the latest error documents from Solr
            final SolrQuery params = new SolrQuery();
            params.setParam("defType", "edismax");
            params.setStart(0);
            params.setRows(100);
            params.setFacet(false);
            params.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc));
            params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
            final QueryResponse rsp = fulltext.getDefaultConnector().getResponseByParams(params);
            final SolrDocumentList docList = rsp == null ? null : rsp.getResults();
            if (docList != null) for (int i = docList.size() - 1; i >= 0; i--) {
                final CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(docList.get(i));
                this.stack.put(ASCII.String(failDoc.getDigestURL().hash()), failDoc);
            }
        } catch (final Throwable e) {
            // initialization is best-effort: start with an empty stack if Solr cannot be queried
        }
    }

    public void clear() throws IOException {
        if (this.stack != null) this.stack.clear();
        this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
    }

    public void remove(final String hash) {
        if (hash == null) return;
        this.stack.remove(hash);
        try {
            this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + hash + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
        } catch (final IOException e) {
            return;
        }
    }

    public void removeHosts(final Iterable<byte[]> hosthashes) {
        if (hosthashes == null) return;
        try {
            for (final byte[] hosthash : hosthashes) {
                this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
            }
            final Iterator<String> i = ErrorCache.this.stack.keySet().iterator();
            while (i.hasNext()) {
                final String b = i.next();
                for (final byte[] hosthash : hosthashes) {
                    // the host hash is embedded at byte offset 6 of the url hash
                    if (NaturalOrder.naturalOrder.equal(hosthash, 0, ASCII.getBytes(b), 6, 6)) i.remove();
                }
            }
        } catch (final IOException e) {
        }
    }

    public void push(final DigestURL url, final CrawlProfile profile, final FailCategory failCategory, String anycause, final int httpcode) {
        assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
        if (exists(url.hash())) return; // don't insert double causes
        if (anycause == null) anycause = "unknown";
        final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
        if (!reason.startsWith("double")) log.info(url.toNormalform(true) + " - " + reason);
        final CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(
                url, profile == null ? null : profile.collections(),
                failCategory.name() + " " + reason, failCategory.failType,
                httpcode);
        this.stack.put(ASCII.String(url.hash()), failDoc);
        if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
            // send the error to solr
            try {
                final SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
                this.fulltext.getDefaultConnector().add(errorDoc);
            } catch (final IOException e) {
                ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
            }
        }
        while (this.stack.size() > maxStackSize)
            this.stack.remove(this.stack.keySet().iterator().next()); // remove the eldest entry (insertion order)
    }

    public ArrayList<CollectionConfiguration.FailDoc> list(int max) {
        final ArrayList<CollectionConfiguration.FailDoc> l = new ArrayList<CollectionConfiguration.FailDoc>();
        final Iterator<CollectionConfiguration.FailDoc> fdi = this.stack.values().iterator();
        // skip older entries so that at most max of the latest entries are returned
        for (int i = 0; i < this.stack.size() - max; i++) fdi.next();
        while (fdi.hasNext()) l.add(fdi.next());
        return l;
    }

    public CollectionConfiguration.FailDoc get(final String urlhash) {
        final CollectionConfiguration.FailDoc fd = this.stack.get(urlhash);
        if (fd != null) return fd;
        try {
            final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlhash);
            if (doc == null) return null;
            return new CollectionConfiguration.FailDoc(doc);
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
            return null;
        }
    }

    public boolean exists(final byte[] urlHash) {
        try {
            return this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + ASCII.String(urlHash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
        } catch (final IOException e) {
            return false;
        }
    }

    public void clearStack() {
        this.stack.clear();
    }

    public int stackSize() {
        return this.stack.size();
    }
}
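With ErrorCache in place, failure records live in Solr plus a bounded in-memory window, so no on-disk ZURL table is needed anymore. A hedged sketch of how a caller might record and look up a failure (ErrorCacheUsage and recordFailure are illustrative names; obtaining a configured Fulltext instance is YaCy-internal and omitted here):

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.search.index.ErrorCache;
import net.yacy.search.index.Fulltext;

public class ErrorCacheUsage {

    // sketch only: the Fulltext instance must come from a running YaCy index
    public static void recordFailure(final Fulltext fulltext, final String badUrl) throws Exception {
        final ErrorCache errors = new ErrorCache(fulltext);
        final DigestURL url = new DigestURL(badUrl);

        // a network failure: kept in the in-memory stack and, because
        // TEMPORARY_NETWORK_FAILURE.store is true, also written to Solr
        errors.push(url, null, FailCategory.TEMPORARY_NETWORK_FAILURE,
                "connection timed out", -1);

        // get() checks the memory window first and falls back to Solr
        System.out.println(errors.get(ASCII.String(url.hash())) != null);
    }
}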