Documents pushed over the api/push_p.html interface will have their unique flag set by default.
pull/1/head
Michael Peter Christen 10 years ago
parent 0871e43fcc
commit 3e6c3e2237

@ -67,7 +67,6 @@ public final class CrawlSwitchboard {
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia"; public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia"; public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
public static final String CRAWL_PROFILE_SURROGATE = "surrogates"; public static final String CRAWL_PROFILE_SURROGATE = "surrogates";
public static final String CRAWL_PROFILE_PUSH_STUB = "push_";
public static Set<String> DEFAULT_PROFILES = new HashSet<String>(); public static Set<String> DEFAULT_PROFILES = new HashSet<String>();
static { static {
@ -517,7 +516,7 @@ public final class CrawlSwitchboard {
CrawlProfile genericPushProfile = this.defaultPushProfiles.get(collection); CrawlProfile genericPushProfile = this.defaultPushProfiles.get(collection);
if (genericPushProfile != null) return genericPushProfile; if (genericPushProfile != null) return genericPushProfile;
genericPushProfile = new CrawlProfile( genericPushProfile = new CrawlProfile(
CRAWL_PROFILE_PUSH_STUB + collection, CrawlProfile.CRAWL_PROFILE_PUSH_STUB + collection,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
@ -545,7 +544,7 @@ public final class CrawlSwitchboard {
this.defaultPushProfiles.put(collection, genericPushProfile); this.defaultPushProfiles.put(collection, genericPushProfile);
return genericPushProfile; return genericPushProfile;
} }
private void resetProfiles() { private void resetProfiles() {
this.profilesActiveCrawlsCache.clear(); this.profilesActiveCrawlsCache.clear();
final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES); final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);

@ -58,6 +58,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING); public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING);
public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING); public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);
public static final String CRAWL_PROFILE_PUSH_STUB = "push_";
// this is a simple record structure that hold all properties of a single crawl start // this is a simple record structure that hold all properties of a single crawl start
private static final String HANDLE = "handle"; private static final String HANDLE = "handle";
public static final String AGENT_NAME = "agentName"; public static final String AGENT_NAME = "agentName";
@ -648,6 +650,10 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+"; if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(url.getPath()).append(".*").toString(); return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(url.getPath()).append(".*").toString();
} }
/**
 * Tells whether this crawl profile was created for documents pushed over the
 * push API: such profiles are named with the {@code CRAWL_PROFILE_PUSH_STUB}
 * ("push_") prefix followed by the collection name.
 * @return true if the profile name carries the push-profile prefix
 */
public boolean isPushCrawlProfile() {
    final String profileName = this.name();
    return profileName.startsWith(CrawlProfile.CRAWL_PROFILE_PUSH_STUB);
}
public void putProfileEntry( public void putProfileEntry(
final String CRAWL_PROFILE_PREFIX, final String CRAWL_PROFILE_PREFIX,

@ -568,7 +568,7 @@ public class Segment {
// CREATE SOLR DOCUMENT // CREATE SOLR DOCUMENT
final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration(); final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration();
final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName); final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, document, condenser, referrerURL, language, crawlProfile.isPushCrawlProfile(), this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION // ENRICH DOCUMENT WITH RANKING INFORMATION
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null); this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);

@ -409,7 +409,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
public SolrVector yacy2solr( public SolrVector yacy2solr(
final Segment segment, final Segment segment,
final Map<String, Pattern> collections, final ResponseHeader responseHeader, final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language, final Document document, final Condenser condenser, final DigestURL referrerURL, final String language, final boolean setUnique,
final WebgraphConfiguration webgraph, final String sourceName) { final WebgraphConfiguration webgraph, final String sourceName) {
// we use the SolrCell design as index schema // we use the SolrCell design as index schema
SolrVector doc = new SolrVector(); SolrVector doc = new SolrVector();
@ -521,8 +521,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.date_in_content_max_dt, date_in_content_max_dt); add(doc, CollectionSchema.date_in_content_max_dt, date_in_content_max_dt);
} }
} }
} }
if (allAttr || contains(CollectionSchema.keywords)) { if (allAttr || contains(CollectionSchema.keywords)) {
String keywords = document.dc_subject(' '); String keywords = document.dc_subject(' ');
@ -537,8 +535,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
} }
// unique-fields; these values must be corrected during postprocessing. (the following logic is !^ (not-xor) but I prefer to write it that way as it is) // unique-fields; these values must be corrected during postprocessing. (the following logic is !^ (not-xor) but I prefer to write it that way as it is)
add(doc, CollectionSchema.http_unique_b, UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP()); // this must be corrected afterwards during storage! add(doc, CollectionSchema.http_unique_b, setUnique || UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP()); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.www_unique_b, host != null && (UNIQUE_HEURISTIC_PREFER_WWWPREFIX ? host.startsWith("www.") : !host.startsWith("www."))); // this must be corrected afterwards during storage! add(doc, CollectionSchema.www_unique_b, setUnique || host != null && (UNIQUE_HEURISTIC_PREFER_WWWPREFIX ? host.startsWith("www.") : !host.startsWith("www."))); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature()); add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature());
add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage! add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!

Loading…
Cancel
Save