Merge branch 'master' into crawlexpert-post

Jens Bertram 11 years ago
commit 85316b3ac6

@@ -74,6 +74,10 @@ source_id_s
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
#source_clickdepth_i
## copy of the citation rank norm value from the source link
source_cr_host_norm_i
## host of the url (source)
#source_host_s
@@ -171,6 +175,10 @@ target_path_folders_sxt
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
#target_clickdepth_i
## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
target_cr_host_norm_i
## host of the url (target)
#target_host_s

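For orientation: in these Solr schema default files, lines starting with ## carry a field's description, a field name prefixed with a single # is listed but disabled, and a bare field name is active. Listing source_cr_host_norm_i and target_cr_host_norm_i without the leading # is therefore what switches the two new citation-rank fields on (assuming the file follows the same convention as YaCy's other defaults/*.schema files).
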
@@ -24,6 +24,8 @@ import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import org.openjena.atlas.logging.Log;
import net.yacy.cora.util.CommonPattern;
import net.yacy.search.schema.CollectionSchema;
@@ -75,16 +77,22 @@ public class Ranking {
* @param boostDef the definition string
*/
public void updateBoosts(String boostDef) {
// call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
// call i.e. with "sku^20.0,url_paths_sxt^20.0,title^15.0,h1_txt^11.0,h2_txt^10.0,author^8.0,description_txt^5.0,keywords^2.0,text_t^1.0,fuzzy_signature_unique_b^100000.0"
if (boostDef == null || boostDef.length() == 0) return;
String[] bf = CommonPattern.COMMA.split(boostDef);
this.fieldBoosts.clear();
for (String boost: bf) {
int p = boost.indexOf('^');
if (p < 0) continue;
CollectionSchema field = CollectionSchema.valueOf(boost.substring(0, p));
Float factor = Float.parseFloat(boost.substring(p + 1));
this.fieldBoosts.put(field, factor);
String boostkey = boost.substring(0, p);
try {
CollectionSchema field = CollectionSchema.valueOf(boostkey);
Float factor = Float.parseFloat(boost.substring(p + 1));
this.fieldBoosts.put(field, factor);
} catch (IllegalArgumentException e) {
// boostkey is unknown; ignore it but print a warning
Log.warn("Ranking", "unknown boost key '" + boostkey + "'");
}
}
}

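For illustration, a minimal standalone sketch of the tolerant parsing that the updateBoosts change introduces, using a plain Map<String, Float> instead of the CollectionSchema enum (class and field names here are made up for the example):

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class BoostParser {
        // Parse "field^factor,field^factor,..."; entries without '^' or with an
        // unparseable factor are skipped instead of aborting the whole definition,
        // mirroring how the patched updateBoosts skips unknown schema keys.
        public static Map<String, Float> parse(String boostDef) {
            Map<String, Float> boosts = new LinkedHashMap<>();
            if (boostDef == null || boostDef.isEmpty()) return boosts;
            for (String boost : boostDef.split(",")) {
                int p = boost.indexOf('^');
                if (p < 0) continue;
                try {
                    boosts.put(boost.substring(0, p), Float.parseFloat(boost.substring(p + 1)));
                } catch (NumberFormatException e) {
                    // malformed factor; ignore this entry but keep the rest
                }
            }
            return boosts;
        }

        public static void main(String[] args) {
            System.out.println(parse("sku^20.0,title^15.0,broken^x"));
            // prints {sku=20.0, title=15.0}
        }
    }
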
@@ -33,6 +33,7 @@ import java.util.Set;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
@@ -78,6 +79,34 @@ public class SchemaConfiguration extends Configuration implements Serializable {
}
}
/**
* Convert a SolrDocument to a SolrInputDocument.
* This is useful if a document from the search index is to be modified and indexed again.
* This should be used as a replacement for ClientUtils.toSolrInputDocument because we remove some fields
* which are created automatically during the indexing process.
* @param doc the solr document
* @param omitFields fields which shall not be copied into the input document
* @return a solr input document
*/
public SolrInputDocument toSolrInputDocument(final SolrDocument doc, Set<String> omitFields) {
SolrInputDocument sid = new SolrInputDocument();
for (String name: doc.getFieldNames()) {
if (this.contains(name) && (omitFields == null || !omitFields.contains(name))) { // check each field if enabled in local Solr schema
sid.addField(name, doc.getFieldValue(name), 1.0f);
}
}
return sid;
}
public SolrDocument toSolrDocument(final SolrInputDocument doc, Set<String> omitFields) {
SolrDocument sd = new SolrDocument();
for (SolrInputField field: doc) {
if (this.contains(field.getName()) && (omitFields == null || !omitFields.contains(field.getName()))) { // check each field if enabled in local Solr schema
sd.setField(field.getName(), field.getValue());
}
}
return sd;
}
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
boolean changed = false;
// FIND OUT IF THIS IS A DOUBLE DOCUMENT

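The two new helpers exist so that a stored document can be pulled out of the index, changed and written back without copying fields that Solr fills in itself. A hedged, SolrJ-only sketch of that round trip (it mirrors the helper above but omits the local-schema check; the field names and the commented-out client call are assumptions, not part of this commit):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.solr.common.SolrDocument;
    import org.apache.solr.common.SolrInputDocument;

    public class ReindexSketch {
        // Copy every field except those in omitFields (typically values the indexing
        // process regenerates, such as the Solr-managed _version_ field).
        static SolrInputDocument toInput(SolrDocument doc, Set<String> omitFields) {
            SolrInputDocument sid = new SolrInputDocument();
            for (String name : doc.getFieldNames()) {
                if (omitFields == null || !omitFields.contains(name)) {
                    sid.setField(name, doc.getFieldValue(name));
                }
            }
            return sid;
        }

        public static void main(String[] args) {
            SolrDocument found = new SolrDocument();        // stands in for a query result
            found.setField("id", "abc");
            found.setField("_version_", 1234L);              // Solr-managed, must not be copied back
            SolrInputDocument sid = toInput(found, new HashSet<>(Arrays.asList("_version_")));
            sid.setField("title", "updated title");          // hypothetical modification
            // solrClient.add(sid);                          // then re-index (client call omitted)
            System.out.println(sid.getFieldNames());         // [id, title]
        }
    }
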
@@ -48,8 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.retrieval.FTPLoader;
import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;

@@ -589,7 +589,7 @@ public class Response {
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
final Date ifModifiedSince = this.requestHeader.ifModifiedSince();
final Date ifModifiedSince = this.ifModifiedSince();
if ((ifModifiedSince != null) && (this.responseHeader.containsKey(HeaderFramework.LAST_MODIFIED))) {
// parse date
Date d = this.responseHeader.lastModified();

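The freshness test here follows the usual HTTP rule: a page counts as unchanged if its Last-Modified date is not newer than the If-Modified-Since date of the request. A small sketch of that comparison with plain java.util.Date values (outside of YaCy's Response class):

    import java.util.Date;

    public class FreshnessSketch {
        // true if the served page is not newer than the copy the requester already has
        static boolean unchangedSince(Date lastModified, Date ifModifiedSince) {
            if (lastModified == null || ifModifiedSince == null) return false; // cannot decide
            return !lastModified.after(ifModifiedSince);
        }

        public static void main(String[] args) {
            Date cachedCopy = new Date(1700000000000L);
            Date servedPage = new Date(1600000000000L);
            System.out.println(unchangedSince(servedPage, cachedCopy)); // true
        }
    }
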
@@ -84,8 +84,8 @@ import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.WebgraphConfiguration.Subgraph;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
@@ -169,32 +169,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
}
/**
* Convert a SolrDocument to a SolrInputDocument.
* This is useful if a document from the search index shall be modified and indexed again.
* This shall be used as replacement of ClientUtils.toSolrInputDocument because we remove some fields
* which are created automatically during the indexing process.
* @param doc the solr document
* @return a solr input document
*/
public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
SolrInputDocument sid = new SolrInputDocument();
for (String name: doc.getFieldNames()) {
if (this.contains(name) && !omitFields.contains(name)) { // check each field if enabled in local Solr schema
sid.addField(name, doc.getFieldValue(name), 1.0f);
}
}
return sid;
return toSolrInputDocument(doc, omitFields);
}
public SolrDocument toSolrDocument(final SolrInputDocument doc) {
SolrDocument sd = new SolrDocument();
for (SolrInputField field: doc) {
if (this.contains(field.getName()) && !omitFields.contains(field.getName())) { // check each field if enabled in local Solr schema
sd.setField(field.getName(), field.getValue());
}
}
return sd;
return toSolrDocument(doc, omitFields);
}
/**
@@ -691,7 +671,23 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
// canonical tag
if (allAttr || contains(CollectionSchema.canonical_s)) {
final DigestURL canonical = html.getCanonical();
DigestURL canonical = html.getCanonical();
// if there is no canonical in the html then look into the http header:
if (canonical == null) {
String link = responseHeader.get("Link", null);
int p;
if (link != null && ((p = link.indexOf("rel=\"canonical\"")) > 0)) {
link = link.substring(0, p).trim();
p = link.indexOf('<');
int q = link.lastIndexOf('>');
if (p >= 0 && q > 0) {
link = link.substring(p + 1, q);
try {
canonical = new DigestURL(link);
} catch (MalformedURLException e) {}
}
}
}
if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) {
containsCanonical = true;
inboundLinks.remove(canonical);
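
The Link header inspected above has the form: Link: <https://example.org/page>; rel="canonical". A hedged standalone sketch of the same substring-based extraction (like the patch, it handles a single link-value rather than a full RFC 5988 header):

    public class CanonicalFromLinkHeader {
        // Returns the URL enclosed in <...> in front of rel="canonical",
        // or null if the header value does not advertise a canonical link.
        static String canonical(String linkHeader) {
            if (linkHeader == null) return null;
            int p = linkHeader.indexOf("rel=\"canonical\"");
            if (p <= 0) return null;
            String link = linkHeader.substring(0, p).trim();
            int lt = link.indexOf('<');
            int gt = link.lastIndexOf('>');
            return (lt >= 0 && gt > lt) ? link.substring(lt + 1, gt) : null;
        }

        public static void main(String[] args) {
            System.out.println(canonical("<https://example.org/page>; rel=\"canonical\""));
            // prints https://example.org/page
        }
    }
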
@@ -888,16 +884,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
public int postprocessing(final Segment segment, String harvestkey) {
if (!this.contains(CollectionSchema.process_sxt)) return 0;
if (!segment.connectedCitation()) return 0;
SolrConnector connector = segment.fulltext().getDefaultConnector();
connector.commit(true); // make sure that we have latest information that can be found
SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
collectionConnector.commit(true); // make sure that we have latest information that can be found
ReferenceReportCache rrCache = segment.getReferenceReportCache();
Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
ReversibleScoreMap<String> hostscore = null;
try {
// collect hosts from index which shall take part in citation computation
ReversibleScoreMap<String> hostscore = connector.getFacets(
hostscore = collectionConnector.getFacets(
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(),
10000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
// for each host, do a citation rank computation
for (String host: hostscore.keyList(true)) {
@@ -915,14 +913,49 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
ranking.putAll(crn); // accumulate this here for usage in document update later
}
} catch (final IOException e2) {
hostscore = new ClusteredScoreMap<String>();
}
// process all documents
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(
// process all documents at the webgraph for the outgoing links of this document
SolrDocument doc;
if (webgraphConnector != null) {
for (String host: hostscore.keyList(true)) {
if (hostscore.get(host) <= 0) continue;
// select all webgraph edges and modify their cr value
BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(
WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"",
0, 10000000, 60000, 50);
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
boolean changed = false;
SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
CRV crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
changed = true;
}
id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
changed = true;
}
if (changed) try {
webgraphConnector.add(sid);
} catch (SolrException e) {
} catch (IOException e) {
}
}
} catch (final InterruptedException e) {}
}
}
// process all documents in collection
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
0, 10000, 60000, 50);
SolrDocument doc;
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
Set<String> uniqueURLs = new HashSet<String>();
@@ -976,7 +1009,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
// send back to index
//connector.deleteById(ASCII.String(id));
connector.add(sid);
collectionConnector.add(sid);
proccount++;
} catch (final Throwable e1) {
}

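In essence, the new webgraph pass copies a per-document citation rank onto every edge that starts or ends at a ranked document, which is what fills the new source_cr_host_norm_i and target_cr_host_norm_i fields. A minimal sketch of that propagation with plain maps (the Edge class and its fields are stand-ins for the webgraph documents, not YaCy types):

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class EdgeRankPropagation {
        static class Edge {
            final String sourceId, targetId;
            Integer sourceCrHostNorm, targetCrHostNorm; // mirrors source_cr_host_norm_i / target_cr_host_norm_i
            Edge(String sourceId, String targetId) { this.sourceId = sourceId; this.targetId = targetId; }
        }

        // For every edge, adopt the rank of its source document and, if known, of its target document.
        static void propagate(List<Edge> edges, Map<String, Integer> ranking) {
            for (Edge e : edges) {
                Integer crn = ranking.get(e.sourceId);
                if (crn != null) e.sourceCrHostNorm = crn;
                crn = ranking.get(e.targetId);
                if (crn != null) e.targetCrHostNorm = crn;
            }
        }

        public static void main(String[] args) {
            Map<String, Integer> ranking = new HashMap<>();
            ranking.put("docA", 7);                          // rank computed in the collection pass
            List<Edge> edges = Arrays.asList(new Edge("docA", "docB"), new Edge("docB", "docA"));
            propagate(edges, ranking);
            System.out.println(edges.get(0).sourceCrHostNorm + " / " + edges.get(1).targetCrHostNorm); // 7 / 7
        }
    }
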
@@ -52,6 +52,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),
source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
source_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (source)"),
@@ -86,6 +87,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),
target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
target_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (target)"),
