free configuration of postprocessing clickdepth maximum depth and time

pull/1/head
Michael Peter Christen 11 years ago
parent 39b641d6cd
commit 63c9fcf3e0

@@ -288,7 +288,7 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
         return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(sb.toString())).charAt(0);
     }
-    public final static Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");
+    public final static Pattern rootPattern = Pattern.compile("/|/\\?|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");
     public final boolean probablyRootURL() {
         return this.path.length() <= 1 || rootPattern.matcher(this.path).matches();
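
Review note: the added "/\\?" alternative lets a bare "?" after the host count as a root path. A minimal sketch of the pattern's behaviour, assuming java.util.regex.Pattern and hypothetical test strings:

    // the extended pattern accepts "/?" in addition to the classic index/home/default paths
    Pattern rootPattern = Pattern.compile("/|/\\?|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");
    System.out.println(rootPattern.matcher("/?").matches());          // true (new with this commit)
    System.out.println(rootPattern.matcher("/index.html").matches()); // true (as before)
    System.out.println(rootPattern.matcher("/about.html").matches()); // false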

@@ -178,12 +178,12 @@ public class SchemaConfiguration extends Configuration implements Serializable {
         return changed;
     }
-    public boolean postprocessing_clickdepth(final ClickdepthCache clickdepthCache, final SolrInputDocument sid, final DigestURL url, final SchemaDeclaration clickdepthfield, final int maxtime) {
+    public boolean postprocessing_clickdepth(final ClickdepthCache clickdepthCache, final SolrInputDocument sid, final DigestURL url, final SchemaDeclaration clickdepthfield) {
         // get new click depth and compare with old
         Integer oldclickdepth = (Integer) sid.getFieldValue(clickdepthfield.getSolrFieldName());
         if (oldclickdepth != null && oldclickdepth.intValue() != 999) return false; // we do not want to compute that again
         try {
-            int clickdepth = clickdepthCache.getClickdepth(url, maxtime);
+            int clickdepth = clickdepthCache.getClickdepth(url);
             if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) {
                 sid.setField(clickdepthfield.getSolrFieldName(), clickdepth);
                 return true;

@@ -542,11 +542,14 @@ public final class Switchboard extends serverSwitch {
         try {
             this.domainList = null;
             if (!getConfig("network.unit.domainlist", "").equals("")) {
-                final Reader r =
-                    getConfigFileFromWebOrLocally(getConfig("network.unit.domainlist", ""), getAppPath()
-                        .getAbsolutePath(), new File(this.networkRoot, "domainlist.txt"));
+                final Reader r = getConfigFileFromWebOrLocally(
+                        getConfig("network.unit.domainlist", ""),
+                        getAppPath().getAbsolutePath(),
+                        new File(this.networkRoot, "domainlist.txt"));
                 this.domainList = new FilterEngine();
-                this.domainList.loadList(new BufferedReader(r), null);
+                BufferedReader br = new BufferedReader(r);
+                this.domainList.loadList(br, null);
+                br.close();
             }
         } catch (final FileNotFoundException e ) {
             this.log.severe("CONFIG: domainlist not found: " + e.getMessage());
@@ -1382,11 +1385,14 @@ public final class Switchboard extends serverSwitch {
         try {
             this.domainList = null;
             if ( !getConfig("network.unit.domainlist", "").equals("") ) {
-                final Reader r =
-                    getConfigFileFromWebOrLocally(getConfig("network.unit.domainlist", ""), getAppPath()
-                        .getAbsolutePath(), new File(this.networkRoot, "domainlist.txt"));
+                final Reader r = getConfigFileFromWebOrLocally(
+                        getConfig("network.unit.domainlist", ""),
+                        getAppPath().getAbsolutePath(),
+                        new File(this.networkRoot, "domainlist.txt"));
                 this.domainList = new FilterEngine();
-                this.domainList.loadList(new BufferedReader(r), null);
+                BufferedReader br = new BufferedReader(r);
+                this.domainList.loadList(br, null);
+                br.close();
             }
         } catch (final FileNotFoundException e ) {
             this.log.severe("CONFIG: domainlist not found: " + e.getMessage());
@@ -1858,8 +1864,9 @@ public final class Switchboard extends serverSwitch {
                 }
                 return moved;
             }
+            InputStream is = null;
             try {
-                InputStream is = new BufferedInputStream(new FileInputStream(infile));
+                is = new BufferedInputStream(new FileInputStream(infile));
                 if ( s.endsWith(".gz") ) {
                     is = new GZIPInputStream(is);
                 }
@@ -1877,8 +1884,10 @@ public final class Switchboard extends serverSwitch {
                     try {
                         final OutputStream os =
                             new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile)));
-                        FileUtils.copy(new BufferedInputStream(new FileInputStream(outfile)), os);
+                        BufferedInputStream bis = new BufferedInputStream(new FileInputStream(outfile));
+                        FileUtils.copy(bis, os);
                         os.close();
+                        bis.close();
                         if ( gzfile.exists() ) {
                             FileUtils.deletedelete(outfile);
                         }
@@ -1890,6 +1899,7 @@ public final class Switchboard extends serverSwitch {
                     }
                 }
             }
+            if (is != null) try {is.close();} catch (IOException e) {}
         }
         return moved;
     }
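
Review note: declaring the InputStream before the try is what allows the new null-guarded close at the end to run on every exit path; whether that closing line sits in a finally block is not visible in this hunk. The general shape, as a sketch:

    InputStream is = null;
    try {
        is = new BufferedInputStream(new FileInputStream(infile));
        if (s.endsWith(".gz")) is = new GZIPInputStream(is);
        // ... import the file ...
    } finally {
        if (is != null) try { is.close(); } catch (final IOException e) {}
    }
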
@@ -2296,7 +2306,9 @@ public final class Switchboard extends serverSwitch {
                 // we optimize first because that is useful for postprocessing
                 int proccount = 0;
                 ReferenceReportCache rrCache = index.getReferenceReportCache();
-                ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache);
+                int clickdepth_maxtime = this.getConfigInt("postprocessing.clickdepth.maxtime", 100);
+                int clickdepth_maxdepth = this.getConfigInt("postprocessing.clickdepth.maxdepth", 6);
+                ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache, clickdepth_maxtime, clickdepth_maxdepth);
                 Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ?
                         this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet<String>();
                 int cleanupByHarvestkey = deletionCandidates.size();
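
Review note: the two new settings are read with fallback defaults that reproduce the previously hard-coded values (callers used to pass a fixed 100 ms budget and the loop in Segment was capped at 6 levels), so behaviour is unchanged unless the keys are set. postprocessing.clickdepth.maxtime is a wall-clock budget in milliseconds (it is added to System.currentTimeMillis() to form the timeout in Segment below), postprocessing.clickdepth.maxdepth caps the number of link levels searched. A hypothetical override, assuming the usual key=value syntax of the YaCy configuration file:

    postprocessing.clickdepth.maxtime=500
    postprocessing.clickdepth.maxdepth=10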

@@ -241,7 +241,9 @@ public final class Fulltext {
     public long collectionSize() {
         long t = System.currentTimeMillis();
         if (t - this.collectionSizeLastAccess < 1000) return this.collectionSizeLastValue;
-        long size = this.solrInstances.getDefaultMirrorConnector().getSize();
+        SolrConnector sc = this.solrInstances.getDefaultMirrorConnector();
+        if (sc == null) return 0;
+        long size = sc.getSize();
         this.collectionSizeLastAccess = t;
         this.collectionSizeLastValue = size;
         return size;

@@ -209,20 +209,20 @@ public class Segment {
      * @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached
      * @throws IOException
      */
-    private int getClickDepth(ReferenceReportCache rrc, final DigestURL url, int maxtime) throws IOException {
+    private int getClickDepth(final ReferenceReportCache rrc, final DigestURL url, final int maxtime, final int maxdepth) throws IOException {
         final byte[] searchhash = url.hash();
         RowHandleSet rootCandidates = getPossibleRootHashes(url);
         if (rootCandidates.has(searchhash)) return 0; // the url is a root candidate itself
-        Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops
+        Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
         Set<String> levelhashes = new HashSet<String>(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
         levelhashes.add(ASCII.String(searchhash));
-        int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
         final byte[] hosthash = new byte[6]; // the host of the url to be checked
         System.arraycopy(searchhash, 6, hosthash, 0, 6);
         long timeout = System.currentTimeMillis() + maxtime;
-        mainloop: for (int maxdepth = 0; maxdepth < 6 && System.currentTimeMillis() < timeout; maxdepth++) {
+        mainloop: for (int leveldepth = 0; leveldepth < maxdepth && System.currentTimeMillis() < timeout; leveldepth++) {
             Set<String> checknext = new HashSet<String>();
@@ -254,15 +254,14 @@ public class Segment {
                 }
                 if (System.currentTimeMillis() > timeout) break mainloop;
             }
-            leveldepth++;
             levelhashes = checknext;
         }
         return 999;
     }
-    private static RowHandleSet getPossibleRootHashes(DigestURL url) {
+    private static RowHandleSet getPossibleRootHashes(final DigestURL url) {
         RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
-        String rootStub = url.getProtocol() + "://" + url.getHost();
+        String rootStub = url.getProtocol() + "://" + url.getHost() + (url.getProtocol().equals("http") && url.getPort() != 80 ? (":" + url.getPort()) : "");
         try {
             rootCandidates.put(new DigestURL(rootStub).hash());
             rootCandidates.put(new DigestURL(rootStub + "/").hash());
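
Review note: only fragments of getClickDepth are visible here. Pieced together from the visible lines, the method is a breadth-first walk from the target URL backwards along incoming references until a root candidate is found; maxdepth now bounds the number of levels (previously hard-coded to 6) and maxtime the wall-clock budget. A condensed sketch of the control flow; the per-level expansion via the ReferenceReportCache is elided because it is unchanged by this commit and not shown above:

    private int getClickDepth(final ReferenceReportCache rrc, final DigestURL url,
                              final int maxtime, final int maxdepth) throws IOException {
        if (getPossibleRootHashes(url).has(url.hash())) return 0;   // the url is a root itself
        Set<String> levelhashes = new HashSet<String>();            // hashes of the current level
        levelhashes.add(ASCII.String(url.hash()));
        long timeout = System.currentTimeMillis() + maxtime;        // maxtime is a millisecond budget
        mainloop: for (int leveldepth = 0; leveldepth < maxdepth && System.currentTimeMillis() < timeout; leveldepth++) {
            Set<String> checknext = new HashSet<String>();
            // ... for every hash in levelhashes look up its referrers via rrc:
            //     a referrer that is a root candidate ends the search with the current depth,
            //     unseen referrers are collected into checknext ...
            if (System.currentTimeMillis() > timeout) break mainloop;
            levelhashes = checknext;
        }
        return 999; // no root reached within maxdepth levels or maxtime milliseconds
    }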
@@ -277,6 +276,7 @@ public class Segment {
             rootCandidates.put(new DigestURL(rootStub + "/default.php").hash());
             rootCandidates.optimize();
         } catch (final Throwable e) {}
+        rootCandidates.optimize();
         return rootCandidates;
     }
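
Review note: the extended rootStub fixes root detection for http hosts on non-standard ports; before this change the candidate root hashes were always built without a port and could never match such a URL. Hypothetical example values, not taken from the code:

    String protocol = "http", host = "example.net"; int port = 8090;  // hypothetical values
    String rootStub = protocol + "://" + host
            + (protocol.equals("http") && port != 80 ? (":" + port) : "");
    // old form:  "http://example.net"       (port dropped, root candidates never match)
    // new form:  "http://example.net:8090"  (root candidates carry the real port)

Note that only http is special-cased; an https URL on a non-standard port still yields a portless stub.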
@@ -304,25 +304,29 @@ public class Segment {
             }
         }
     }
-    public ClickdepthCache getClickdepthCache(ReferenceReportCache rrc) {
-        return new ClickdepthCache(rrc);
+    public ClickdepthCache getClickdepthCache(ReferenceReportCache rrc, final int maxtime, final int maxdepth) {
+        return new ClickdepthCache(rrc, maxtime, maxdepth);
     }
     public class ClickdepthCache {
-        final ReferenceReportCache rrc;
-        final Map<String, Integer> cache;
-        public ClickdepthCache(ReferenceReportCache rrc) {
+        private final ReferenceReportCache rrc;
+        private final Map<String, Integer> cache;
+        public final int maxdepth; // maximum clickdepth
+        public final int maxtime; // maximum time to compute clickdepth
+        public ClickdepthCache(final ReferenceReportCache rrc, final int maxtime, final int maxdepth) {
            this.rrc = rrc;
            this.cache = new ConcurrentHashMap<String, Integer>();
+            this.maxdepth = maxdepth;
+            this.maxtime = maxtime;
         }
-        public int getClickdepth(final DigestURL url, int maxtime) throws IOException {
+        public int getClickdepth(final DigestURL url) throws IOException {
             Integer clickdepth = cache.get(ASCII.String(url.hash()));
             if (MemoryControl.shortStatus()) cache.clear();
             if (clickdepth != null) {
                 //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT");
                 return clickdepth.intValue();
             }
-            clickdepth = Segment.this.getClickDepth(this.rrc, url, maxtime);
+            clickdepth = Segment.this.getClickDepth(this.rrc, url, this.maxtime, this.maxdepth);
             //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth);
             this.cache.put(ASCII.String(url.hash()), clickdepth);
             return clickdepth.intValue();
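
Review note: with the limits stored in the cache, one ClickdepthCache created at the start of postprocessing carries the configured bounds to every lookup, and callers no longer pass a time budget per URL. A sketch of the intended wiring, using names from the hunks above (sb standing for the Switchboard instance; the IOException thrown by getClickdepth is not handled here):

    ReferenceReportCache rrCache = index.getReferenceReportCache();
    ClickdepthCache clickdepthCache = index.getClickdepthCache(
            rrCache,
            sb.getConfigInt("postprocessing.clickdepth.maxtime", 100),   // milliseconds
            sb.getConfigInt("postprocessing.clickdepth.maxdepth", 6));   // link levels
    int depth = clickdepthCache.getClickdepth(url);                      // 999 = not reachable within the limits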

@@ -1085,7 +1085,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                     id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
                     try {
                         url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id));
-                        postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.source_clickdepth_i, 100);
+                        postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.source_clickdepth_i);
                     } catch (MalformedURLException e) {
                     }
                 }
@@ -1095,7 +1095,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                     id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
                     try {
                         url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id));
-                        postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.target_clickdepth_i, 100);
+                        postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.target_clickdepth_i);
                     } catch (MalformedURLException e) {
                     }
                 }
@@ -1167,7 +1167,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                     // switch over tag types
                     ProcessType tagtype = ProcessType.valueOf((String) tag);
                     if (tagtype == ProcessType.CLICKDEPTH && collection.contains(CollectionSchema.clickdepth_i)) {
-                        if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i, 100)) proccount_clickdepthchange++;
+                        if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i)) proccount_clickdepthchange++;
                     }
                     if (tagtype == ProcessType.CITATION &&
