From 3e6c3e2237090bcc9cdad78d743926539ec21a33 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 6 Jan 2015 15:22:59 +0100 Subject: [PATCH] documents pushed over the api/push_p.html interface will have their unique flag set by default --- source/net/yacy/crawler/CrawlSwitchboard.java | 5 ++--- source/net/yacy/crawler/data/CrawlProfile.java | 6 ++++++ source/net/yacy/search/index/Segment.java | 2 +- .../net/yacy/search/schema/CollectionConfiguration.java | 8 +++----- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index 12ccf4bbe..35f6b9305 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -67,7 +67,6 @@ public final class CrawlSwitchboard { public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia"; public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia"; public static final String CRAWL_PROFILE_SURROGATE = "surrogates"; - public static final String CRAWL_PROFILE_PUSH_STUB = "push_"; public static Set DEFAULT_PROFILES = new HashSet(); static { @@ -517,7 +516,7 @@ public final class CrawlSwitchboard { CrawlProfile genericPushProfile = this.defaultPushProfiles.get(collection); if (genericPushProfile != null) return genericPushProfile; genericPushProfile = new CrawlProfile( - CRAWL_PROFILE_PUSH_STUB + collection, + CrawlProfile.CRAWL_PROFILE_PUSH_STUB + collection, CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch @@ -545,7 +544,7 @@ public final class CrawlSwitchboard { this.defaultPushProfiles.put(collection, genericPushProfile); return genericPushProfile; } - + private void resetProfiles() { this.profilesActiveCrawlsCache.clear(); final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES); diff --git 
a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index 47a12cf7f..1bcc83eab 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -58,6 +58,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING); public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING); + public static final String CRAWL_PROFILE_PUSH_STUB = "push_"; + // this is a simple record structure that hold all properties of a single crawl start private static final String HANDLE = "handle"; public static final String AGENT_NAME = "agentName"; @@ -648,6 +650,10 @@ public class CrawlProfile extends ConcurrentHashMap implements M if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+"; return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(url.getPath()).append(".*").toString(); } + + public boolean isPushCrawlProfile() { + return this.name().startsWith(CrawlProfile.CRAWL_PROFILE_PUSH_STUB); + } public void putProfileEntry( final String CRAWL_PROFILE_PREFIX, diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index b6d3ea2ca..898d1886b 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -568,7 +568,7 @@ public class Segment { // CREATE SOLR DOCUMENT final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration(); - final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? 
this.fulltext.getWebgraphConfiguration() : null, sourceName); + final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, document, condenser, referrerURL, language, crawlProfile.isPushCrawlProfile(), this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName); // ENRICH DOCUMENT WITH RANKING INFORMATION this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 8f416b028..e66963ae1 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -409,7 +409,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri public SolrVector yacy2solr( final Segment segment, final Map collections, final ResponseHeader responseHeader, - final Document document, final Condenser condenser, final DigestURL referrerURL, final String language, + final Document document, final Condenser condenser, final DigestURL referrerURL, final String language, final boolean setUnique, final WebgraphConfiguration webgraph, final String sourceName) { // we use the SolrCell design as index schema SolrVector doc = new SolrVector(); @@ -521,8 +521,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri add(doc, CollectionSchema.date_in_content_max_dt, date_in_content_max_dt); } } - - } if (allAttr || contains(CollectionSchema.keywords)) { String keywords = document.dc_subject(' '); @@ -537,8 +535,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } // unique-fields; these values must be corrected during postprocessing. 
(the following logic is !^ (not-xor) but I prefer to write it that way as it is) - add(doc, CollectionSchema.http_unique_b, UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP()); // this must be corrected afterwards during storage! - add(doc, CollectionSchema.www_unique_b, host != null && (UNIQUE_HEURISTIC_PREFER_WWWPREFIX ? host.startsWith("www.") : !host.startsWith("www."))); // this must be corrected afterwards during storage! + add(doc, CollectionSchema.http_unique_b, setUnique || (UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP())); // setUnique must override the whole heuristic ('||' binds tighter than '?:', hence the parentheses); this must be corrected afterwards during storage! + add(doc, CollectionSchema.www_unique_b, setUnique || (host != null && (UNIQUE_HEURISTIC_PREFER_WWWPREFIX ? host.startsWith("www.") : !host.startsWith("www.")))); // this must be corrected afterwards during storage! add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature()); add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!