Documents pushed over the api/push_p.html interface will have their unique flag set by default.
pull/1/head
Michael Peter Christen 10 years ago
parent 0871e43fcc
commit 3e6c3e2237

@ -67,7 +67,6 @@ public final class CrawlSwitchboard {
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
public static final String CRAWL_PROFILE_SURROGATE = "surrogates";
public static final String CRAWL_PROFILE_PUSH_STUB = "push_";
public static Set<String> DEFAULT_PROFILES = new HashSet<String>();
static {
@ -517,7 +516,7 @@ public final class CrawlSwitchboard {
CrawlProfile genericPushProfile = this.defaultPushProfiles.get(collection);
if (genericPushProfile != null) return genericPushProfile;
genericPushProfile = new CrawlProfile(
CRAWL_PROFILE_PUSH_STUB + collection,
CrawlProfile.CRAWL_PROFILE_PUSH_STUB + collection,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
@ -545,7 +544,7 @@ public final class CrawlSwitchboard {
this.defaultPushProfiles.put(collection, genericPushProfile);
return genericPushProfile;
}
private void resetProfiles() {
this.profilesActiveCrawlsCache.clear();
final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);

@ -58,6 +58,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING);
public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);
public static final String CRAWL_PROFILE_PUSH_STUB = "push_";
// this is a simple record structure that hold all properties of a single crawl start
private static final String HANDLE = "handle";
public static final String AGENT_NAME = "agentName";
@ -648,6 +650,10 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(url.getPath()).append(".*").toString();
}
/**
 * Tells whether this profile is a push profile, judged by its name only.
 *
 * @return {@code true} if the profile name begins with
 *         {@link CrawlProfile#CRAWL_PROFILE_PUSH_STUB}, {@code false} otherwise
 */
public boolean isPushCrawlProfile() {
    final String profileName = this.name();
    final boolean hasPushPrefix = profileName.startsWith(CrawlProfile.CRAWL_PROFILE_PUSH_STUB);
    return hasPushPrefix;
}
public void putProfileEntry(
final String CRAWL_PROFILE_PREFIX,

@ -568,7 +568,7 @@ public class Segment {
// CREATE SOLR DOCUMENT
final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration();
final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, document, condenser, referrerURL, language, crawlProfile.isPushCrawlProfile(), this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);

@ -409,7 +409,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
public SolrVector yacy2solr(
final Segment segment,
final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language,
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language, final boolean setUnique,
final WebgraphConfiguration webgraph, final String sourceName) {
// we use the SolrCell design as index schema
SolrVector doc = new SolrVector();
@ -521,8 +521,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.date_in_content_max_dt, date_in_content_max_dt);
}
}
}
if (allAttr || contains(CollectionSchema.keywords)) {
String keywords = document.dc_subject(' ');
@ -537,8 +535,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
// unique-fields; these values must be corrected during postprocessing. (the following logic is !^ (not-xor) but I prefer to write it that way as it is)
add(doc, CollectionSchema.http_unique_b, UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP()); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.www_unique_b, host != null && (UNIQUE_HEURISTIC_PREFER_WWWPREFIX ? host.startsWith("www.") : !host.startsWith("www."))); // this must be corrected afterwards during storage!
// Unique flags: when setUnique is true both flags are forced to true; otherwise the
// protocol / www-prefix heuristic decides. The conditional expressions are parenthesized
// explicitly because '||' binds tighter than '?:' -- without the parentheses setUnique
// would merely select the isHTTPS() branch instead of forcing http_unique_b to true.
add(doc, CollectionSchema.http_unique_b, setUnique || (UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP())); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.www_unique_b, setUnique || (host != null && (UNIQUE_HEURISTIC_PREFER_WWWPREFIX ? host.startsWith("www.") : !host.startsWith("www.")))); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature());
add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!

Loading…
Cancel
Save