Allow free configuration of the postprocessing clickdepth maximum depth and maximum computation time

pull/1/head
Michael Peter Christen 11 years ago
parent 39b641d6cd
commit 63c9fcf3e0

@ -288,7 +288,7 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(sb.toString())).charAt(0); return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(sb.toString())).charAt(0);
} }
public final static Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php"); public final static Pattern rootPattern = Pattern.compile("/|/\\?|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");
public final boolean probablyRootURL() { public final boolean probablyRootURL() {
return this.path.length() <= 1 || rootPattern.matcher(this.path).matches(); return this.path.length() <= 1 || rootPattern.matcher(this.path).matches();

@ -178,12 +178,12 @@ public class SchemaConfiguration extends Configuration implements Serializable {
return changed; return changed;
} }
public boolean postprocessing_clickdepth(final ClickdepthCache clickdepthCache, final SolrInputDocument sid, final DigestURL url, final SchemaDeclaration clickdepthfield, final int maxtime) { public boolean postprocessing_clickdepth(final ClickdepthCache clickdepthCache, final SolrInputDocument sid, final DigestURL url, final SchemaDeclaration clickdepthfield) {
// get new click depth and compare with old // get new click depth and compare with old
Integer oldclickdepth = (Integer) sid.getFieldValue(clickdepthfield.getSolrFieldName()); Integer oldclickdepth = (Integer) sid.getFieldValue(clickdepthfield.getSolrFieldName());
if (oldclickdepth != null && oldclickdepth.intValue() != 999) return false; // we do not want to compute that again if (oldclickdepth != null && oldclickdepth.intValue() != 999) return false; // we do not want to compute that again
try { try {
int clickdepth = clickdepthCache.getClickdepth(url, maxtime); int clickdepth = clickdepthCache.getClickdepth(url);
if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) { if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) {
sid.setField(clickdepthfield.getSolrFieldName(), clickdepth); sid.setField(clickdepthfield.getSolrFieldName(), clickdepth);
return true; return true;

@ -541,12 +541,15 @@ public final class Switchboard extends serverSwitch {
// load domainList // load domainList
try { try {
this.domainList = null; this.domainList = null;
if ( !getConfig("network.unit.domainlist", "").equals("") ) { if (!getConfig("network.unit.domainlist", "").equals("")) {
final Reader r = final Reader r = getConfigFileFromWebOrLocally(
getConfigFileFromWebOrLocally(getConfig("network.unit.domainlist", ""), getAppPath() getConfig("network.unit.domainlist", ""),
.getAbsolutePath(), new File(this.networkRoot, "domainlist.txt")); getAppPath().getAbsolutePath(),
new File(this.networkRoot, "domainlist.txt"));
this.domainList = new FilterEngine(); this.domainList = new FilterEngine();
this.domainList.loadList(new BufferedReader(r), null); BufferedReader br = new BufferedReader(r);
this.domainList.loadList(br, null);
br.close();
} }
} catch (final FileNotFoundException e ) { } catch (final FileNotFoundException e ) {
this.log.severe("CONFIG: domainlist not found: " + e.getMessage()); this.log.severe("CONFIG: domainlist not found: " + e.getMessage());
@ -1382,11 +1385,14 @@ public final class Switchboard extends serverSwitch {
try { try {
this.domainList = null; this.domainList = null;
if ( !getConfig("network.unit.domainlist", "").equals("") ) { if ( !getConfig("network.unit.domainlist", "").equals("") ) {
final Reader r = final Reader r = getConfigFileFromWebOrLocally(
getConfigFileFromWebOrLocally(getConfig("network.unit.domainlist", ""), getAppPath() getConfig("network.unit.domainlist", ""),
.getAbsolutePath(), new File(this.networkRoot, "domainlist.txt")); getAppPath().getAbsolutePath(),
new File(this.networkRoot, "domainlist.txt"));
this.domainList = new FilterEngine(); this.domainList = new FilterEngine();
this.domainList.loadList(new BufferedReader(r), null); BufferedReader br = new BufferedReader(r);
this.domainList.loadList(br, null);
br.close();
} }
} catch (final FileNotFoundException e ) { } catch (final FileNotFoundException e ) {
this.log.severe("CONFIG: domainlist not found: " + e.getMessage()); this.log.severe("CONFIG: domainlist not found: " + e.getMessage());
@ -1858,8 +1864,9 @@ public final class Switchboard extends serverSwitch {
} }
return moved; return moved;
} }
InputStream is = null;
try { try {
InputStream is = new BufferedInputStream(new FileInputStream(infile)); is = new BufferedInputStream(new FileInputStream(infile));
if ( s.endsWith(".gz") ) { if ( s.endsWith(".gz") ) {
is = new GZIPInputStream(is); is = new GZIPInputStream(is);
} }
@ -1877,8 +1884,10 @@ public final class Switchboard extends serverSwitch {
try { try {
final OutputStream os = final OutputStream os =
new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile))); new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile)));
FileUtils.copy(new BufferedInputStream(new FileInputStream(outfile)), os); BufferedInputStream bis = new BufferedInputStream(new FileInputStream(outfile));
FileUtils.copy(bis, os);
os.close(); os.close();
bis.close();
if ( gzfile.exists() ) { if ( gzfile.exists() ) {
FileUtils.deletedelete(outfile); FileUtils.deletedelete(outfile);
} }
@ -1890,6 +1899,7 @@ public final class Switchboard extends serverSwitch {
} }
} }
} }
if (is != null) try {is.close();} catch (IOException e) {}
} }
return moved; return moved;
} }
@ -2296,7 +2306,9 @@ public final class Switchboard extends serverSwitch {
// we optimize first because that is useful for postprocessing // we optimize first because that is useful for postprocessing
int proccount = 0; int proccount = 0;
ReferenceReportCache rrCache = index.getReferenceReportCache(); ReferenceReportCache rrCache = index.getReferenceReportCache();
ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache); int clickdepth_maxtime = this.getConfigInt("postprocessing.clickdepth.maxtime", 100);
int clickdepth_maxdepth = this.getConfigInt("postprocessing.clickdepth.maxdepth", 6);
ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache, clickdepth_maxtime, clickdepth_maxdepth);
Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ? Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ?
this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet<String>(); this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet<String>();
int cleanupByHarvestkey = deletionCandidates.size(); int cleanupByHarvestkey = deletionCandidates.size();

@ -241,7 +241,9 @@ public final class Fulltext {
public long collectionSize() { public long collectionSize() {
long t = System.currentTimeMillis(); long t = System.currentTimeMillis();
if (t - this.collectionSizeLastAccess < 1000) return this.collectionSizeLastValue; if (t - this.collectionSizeLastAccess < 1000) return this.collectionSizeLastValue;
long size = this.solrInstances.getDefaultMirrorConnector().getSize(); SolrConnector sc = this.solrInstances.getDefaultMirrorConnector();
if (sc == null) return 0;
long size = sc.getSize();
this.collectionSizeLastAccess = t; this.collectionSizeLastAccess = t;
this.collectionSizeLastValue = size; this.collectionSizeLastValue = size;
return size; return size;

@ -209,20 +209,20 @@ public class Segment {
* @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached * @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached
* @throws IOException * @throws IOException
*/ */
private int getClickDepth(ReferenceReportCache rrc, final DigestURL url, int maxtime) throws IOException { private int getClickDepth(final ReferenceReportCache rrc, final DigestURL url, final int maxtime, final int maxdepth) throws IOException {
final byte[] searchhash = url.hash(); final byte[] searchhash = url.hash();
RowHandleSet rootCandidates = getPossibleRootHashes(url); RowHandleSet rootCandidates = getPossibleRootHashes(url);
if (rootCandidates.has(searchhash)) return 0; // the url is a root candidate itself
Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
Set<String> levelhashes = new HashSet<String>(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry Set<String> levelhashes = new HashSet<String>(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
levelhashes.add(ASCII.String(searchhash)); levelhashes.add(ASCII.String(searchhash));
int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
final byte[] hosthash = new byte[6]; // the host of the url to be checked final byte[] hosthash = new byte[6]; // the host of the url to be checked
System.arraycopy(searchhash, 6, hosthash, 0, 6); System.arraycopy(searchhash, 6, hosthash, 0, 6);
long timeout = System.currentTimeMillis() + maxtime; long timeout = System.currentTimeMillis() + maxtime;
mainloop: for (int maxdepth = 0; maxdepth < 6 && System.currentTimeMillis() < timeout; maxdepth++) { mainloop: for (int leveldepth = 0; leveldepth < maxdepth && System.currentTimeMillis() < timeout; leveldepth++) {
Set<String> checknext = new HashSet<String>(); Set<String> checknext = new HashSet<String>();
@ -254,15 +254,14 @@ public class Segment {
} }
if (System.currentTimeMillis() > timeout) break mainloop; if (System.currentTimeMillis() > timeout) break mainloop;
} }
leveldepth++;
levelhashes = checknext; levelhashes = checknext;
} }
return 999; return 999;
} }
private static RowHandleSet getPossibleRootHashes(DigestURL url) { private static RowHandleSet getPossibleRootHashes(final DigestURL url) {
RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10); RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
String rootStub = url.getProtocol() + "://" + url.getHost(); String rootStub = url.getProtocol() + "://" + url.getHost() + (url.getProtocol().equals("http") && url.getPort() != 80 ? (":" + url.getPort()) : "");
try { try {
rootCandidates.put(new DigestURL(rootStub).hash()); rootCandidates.put(new DigestURL(rootStub).hash());
rootCandidates.put(new DigestURL(rootStub + "/").hash()); rootCandidates.put(new DigestURL(rootStub + "/").hash());
@ -277,6 +276,7 @@ public class Segment {
rootCandidates.put(new DigestURL(rootStub + "/default.php").hash()); rootCandidates.put(new DigestURL(rootStub + "/default.php").hash());
rootCandidates.optimize(); rootCandidates.optimize();
} catch (final Throwable e) {} } catch (final Throwable e) {}
rootCandidates.optimize();
return rootCandidates; return rootCandidates;
} }
@ -304,25 +304,29 @@ public class Segment {
} }
} }
public ClickdepthCache getClickdepthCache(ReferenceReportCache rrc) { public ClickdepthCache getClickdepthCache(ReferenceReportCache rrc, final int maxtime, final int maxdepth) {
return new ClickdepthCache(rrc); return new ClickdepthCache(rrc, maxtime, maxdepth);
} }
public class ClickdepthCache { public class ClickdepthCache {
final ReferenceReportCache rrc; private final ReferenceReportCache rrc;
final Map<String, Integer> cache; private final Map<String, Integer> cache;
public ClickdepthCache(ReferenceReportCache rrc) { public final int maxdepth; // maximum clickdepth
public final int maxtime; // maximum time to compute clickdepth
public ClickdepthCache(final ReferenceReportCache rrc, final int maxtime, final int maxdepth) {
this.rrc = rrc; this.rrc = rrc;
this.cache = new ConcurrentHashMap<String, Integer>(); this.cache = new ConcurrentHashMap<String, Integer>();
this.maxdepth = maxdepth;
this.maxtime = maxtime;
} }
public int getClickdepth(final DigestURL url, int maxtime) throws IOException { public int getClickdepth(final DigestURL url) throws IOException {
Integer clickdepth = cache.get(ASCII.String(url.hash())); Integer clickdepth = cache.get(ASCII.String(url.hash()));
if (MemoryControl.shortStatus()) cache.clear(); if (MemoryControl.shortStatus()) cache.clear();
if (clickdepth != null) { if (clickdepth != null) {
//ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT"); //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT");
return clickdepth.intValue(); return clickdepth.intValue();
} }
clickdepth = Segment.this.getClickDepth(this.rrc, url, maxtime); clickdepth = Segment.this.getClickDepth(this.rrc, url, this.maxtime, this.maxdepth);
//ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth); //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth);
this.cache.put(ASCII.String(url.hash()), clickdepth); this.cache.put(ASCII.String(url.hash()), clickdepth);
return clickdepth.intValue(); return clickdepth.intValue();

@ -1085,7 +1085,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
try { try {
url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id)); url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id));
postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.source_clickdepth_i, 100); postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.source_clickdepth_i);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
} }
} }
@ -1095,7 +1095,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()); id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
try { try {
url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id)); url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id));
postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.target_clickdepth_i, 100); postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.target_clickdepth_i);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
} }
} }
@ -1167,7 +1167,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// switch over tag types // switch over tag types
ProcessType tagtype = ProcessType.valueOf((String) tag); ProcessType tagtype = ProcessType.valueOf((String) tag);
if (tagtype == ProcessType.CLICKDEPTH && collection.contains(CollectionSchema.clickdepth_i)) { if (tagtype == ProcessType.CLICKDEPTH && collection.contains(CollectionSchema.clickdepth_i)) {
if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i, 100)) proccount_clickdepthchange++; if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i)) proccount_clickdepthchange++;
} }
if (tagtype == ProcessType.CITATION && if (tagtype == ProcessType.CITATION &&

Loading…
Cancel
Save