added directDocByURL attribute in crawl profile

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7985 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 13 years ago
parent c61e4cfd78
commit cf4fd525ee

@@ -516,6 +516,7 @@ proxyURL.rewriteURLs=domainlist
# Be careful with this number. Consider a branching factor of average 20;
# A prefetch-depth of 8 would index 25.600.000.000 pages, maybe the whole WWW.
crawlingDepth=3
crawlingDirectDocByURL=true
crawlingIfOlder=-1
crawlingDomFilterDepth=-1
crawlingDomMaxPages=-1
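
The comment above estimates the reach of the depth setting: with an average branching factor of 20, depth 8 touches 20^8 = 25,600,000,000 pages, while the shipped default of 3 stays at 20^3 = 8,000. A quick self-contained check of that arithmetic:

    // quick check of the estimate in the comment above: pages reachable with branching factor 20
    public class CrawlDepthEstimate {
        public static void main(final String[] args) {
            for (final int depth : new int[] { 3, 8 }) {
                long pages = 1L;
                for (int i = 0; i < depth; i++) pages *= 20L;
                System.out.println("depth " + depth + " -> " + pages + " pages"); // 3 -> 8000, 8 -> 25600000000
            }
        }
    }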
@@ -710,10 +711,6 @@ crawler.file.maxFileSize=100000000
# maximum number of crawler threads
crawler.MaxActiveThreads = 200
# flag: consider all embedded image/audio/video document links
# from all crawled documents as its own document
crawler.embedLinksAsDocuments = true
# maximum size of indexing queue
indexer.slots = 100
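
The removed crawler.embedLinksAsDocuments switch described exactly the behaviour that the new per-profile flag now controls: embedded image/audio/video links from crawled documents are turned into documents of their own, even though no parser can handle them. The sketch below is only a behavioural model of that rule, not YaCy's TextParser (whose call site appears in the Switchboard hunk at the end); the names and the media-link list are illustrative.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    // behavioural model only: what directDocByURL / embedLinksAsDocuments means for one parsed page
    public class EmbedLinksAsDocumentsModel {
        static List<String> parse(final String pageUrl, final List<String> embeddedMediaLinks,
                                  final boolean directDocByURL) {
            final List<String> docs = new ArrayList<String>();
            docs.add("parsed document for " + pageUrl);
            if (directDocByURL) {
                // flag on: every linked, non-parsable media target becomes an own URL-only document
                for (final String link : embeddedMediaLinks) docs.add("URL-only document for " + link);
            }
            return docs;
        }

        public static void main(final String[] args) {
            final List<String> media = Arrays.asList("http://example.org/a.png", "http://example.org/b.ogg");
            System.out.println(parse("http://example.org/", media, true));  // three documents
            System.out.println(parse("http://example.org/", media, false)); // one document
        }
    }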

@@ -78,8 +78,11 @@
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingDepth">Crawling Depth</label>:</td>
<td><input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" /></td>
<td>Crawling Depth:</td>
<td>
<input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;&nbsp;
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents
</td>
<td>
This defines how often the Crawler will follow links (of links..) embedded in websites.
0 means that only the page you enter under "Starting Point" will be added
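
The new checkbox is driven by YaCy's template alternatives syntax: the servlet (next hunk) fills directDocByURLChecked with "0" or "1", and the #(key)#case-for-0::case-for-1#(/key)# construct renders checked="checked" only for "1". Below is a toy substitution for just this two-case form, a simplified model and not YaCy's real template engine:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    // toy model of the #(key)#case-for-0::case-for-1#(/key)# construct; not YaCy's template engine
    public class TemplateToggleSketch {
        static String render(final String template, final String key, final String value) {
            final Pattern p = Pattern.compile("#\\(" + key + "\\)#(.*?)::(.*?)#\\(/" + key + "\\)#", Pattern.DOTALL);
            final Matcher m = p.matcher(template);
            if (!m.find()) return template;
            final String chosen = "1".equals(value) ? m.group(2) : m.group(1);
            return m.replaceFirst(Matcher.quoteReplacement(chosen));
        }

        public static void main(final String[] args) {
            final String t = "<input type=\"checkbox\" name=\"directDocByURL\" "
                           + "#(directDocByURLChecked)#::checked=\"checked\"#(/directDocByURLChecked)# />";
            System.out.println(render(t, "directDocByURLChecked", "1")); // checkbox rendered checked
            System.out.println(render(t, "directDocByURLChecked", "0")); // attribute left out
        }
    }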

@@ -42,6 +42,7 @@ public class CrawlStartExpert_p {
prop.put("starturl", /*(intranet) ? repository :*/ "http://");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));

@@ -187,6 +187,9 @@ public class Crawler_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
final boolean directDocByURL = "on".equals(post.get("directDocByURL", "on"));
env.setConfig("crawlingDirectDocByURL", directDocByURL);
// recrawl
final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
boolean crawlingIfOlderCheck = "on".equals(post.get("crawlingIfOlderCheck", "off"));
@@ -262,6 +265,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
@@ -321,6 +325,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
@@ -444,6 +449,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
false,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
@@ -484,6 +490,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
0,
false,
crawlingIfOlder,
crawlingDomMaxPages,
true,
@@ -528,6 +535,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
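
In Crawler_p the flag makes a full round trip: a ticked checkbox arrives as the string "on" (an absent field falls back to the default "on"), is turned into a boolean, stored as the new crawlingDirectDocByURL default, and handed to the CrawlProfile constructors above; two of the constructor calls pass a hard-coded false instead. A condensed, self-contained model of that derivation, with plain maps standing in for serverObjects and serverSwitch:

    import java.util.HashMap;
    import java.util.Map;

    // condensed model of the round trip in Crawler_p; plain maps stand in for serverObjects/serverSwitch
    public class DirectDocByURLRoundTrip {
        public static void main(final String[] args) {
            final Map<String, String> post = new HashMap<String, String>();   // submitted form fields
            final Map<String, String> config = new HashMap<String, String>(); // yacy.conf values

            // a ticked checkbox is submitted as "on"; an absent field falls back to the default "on"
            final String raw = post.containsKey("directDocByURL") ? post.get("directDocByURL") : "on";
            final boolean directDocByURL = "on".equals(raw);

            // persisted as the new default and later passed into the CrawlProfile constructor
            config.put("crawlingDirectDocByURL", Boolean.toString(directDocByURL));
            System.out.println(config); // {crawlingDirectDocByURL=true}
        }
    }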

@@ -154,6 +154,7 @@ public class QuickCrawlLink_p {
"",
crawlingMustNotMatch,
CrawlingDepth,
true,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domMaxPages, if negative: no count restriction
crawlDynamic,

@@ -51,6 +51,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String DEPTH = "generalDepth";
public static final String DIRECT_DOC_BY_URL = "directDocByURL";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ";
@@ -77,7 +78,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* @param startURL root URL of the crawl
* @param urlMustMatch URLs which do not match this regex will be ignored
* @param urlMustNotMatch URLs which match this regex will be ignored
* @param ipMustMatch IPs from URLs which do not match this regex will be ignored
* @param ipMustNotMatch IPs from URLs which match this regex will be ignored
* @param countryMustMatch URLs from a specific country must match
* @param depth height of the tree which will be created by the crawler
* @param directDocByURL if true, then linked documents that cannot be parsed are indexed as documents of their own
* @param recrawlIfOlder documents which have been indexed in the past will
* be indexed again if they are older than the time (ms) in this parameter
* @param domMaxPages maximum number from one domain which will be indexed
@@ -100,6 +105,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String ipMustNotMatch,
final String countryMustMatch,
final int depth,
final boolean directDocByURL,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
final boolean crawlingQ,
@@ -127,6 +133,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : ipMustNotMatch);
put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch);
put(DEPTH, depth);
put(DIRECT_DOC_BY_URL, directDocByURL);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_MAX_PAGES, domMaxPages);
put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
@@ -298,6 +305,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
public boolean directDocByURL() {
final String r = get(DIRECT_DOC_BY_URL);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public CacheStrategy cacheStrategy() {
final String r = get(CACHE_STRAGEGY);
if (r == null) return CacheStrategy.IFEXIST;
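
CrawlProfile extends ConcurrentHashMap<String, String>, so the constructor's put(DIRECT_DOC_BY_URL, directDocByURL) has to store the boolean in string form, and the new directDocByURL() accessor reads it back by comparing against Boolean.TRUE.toString(), defaulting to false for profiles written before this change. A minimal model of that store/read pattern; the boolean put helper is an assumption consistent with the <String, String> map type, not a quote of the real class:

    import java.util.concurrent.ConcurrentHashMap;

    // minimal model of CrawlProfile's string-map storage for the new flag; not the real class
    public class DirectDocByURLStorageSketch extends ConcurrentHashMap<String, String> {
        public static final String DIRECT_DOC_BY_URL = "directDocByURL";

        // assumed equivalent of CrawlProfile's boolean put helper: store the string form
        public void put(final String key, final boolean value) {
            super.put(key, Boolean.toString(value));
        }

        // mirrors the accessor added in the hunk above: a missing key counts as false
        public boolean directDocByURL() {
            final String r = get(DIRECT_DOC_BY_URL);
            if (r == null) return false;
            return r.equals(Boolean.TRUE.toString());
        }

        public static void main(final String[] args) {
            final DirectDocByURLStorageSketch profile = new DirectDocByURLStorageSketch();
            System.out.println(profile.directDocByURL()); // false: key absent (old profile)
            profile.put(DIRECT_DOC_BY_URL, true);
            System.out.println(profile.directDocByURL()); // true
        }
    }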

@@ -233,6 +233,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING,
"",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
@@ -243,38 +244,38 @@ public final class CrawlSwitchboard {
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0,
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0, true,
-1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
}

@@ -875,18 +875,17 @@ public class Domains {
);
}
public static boolean isLocal(final String host, final InetAddress hostaddress) {
return isLocal(host, hostaddress, true);
}
/**
* check if the given host is a local address.
* the hostaddress is optional and shall be given if the address is already known
* @param host
* @param hostaddress may be null if not known yet
* @param recursive
* @return true if the given host is local
*/
public static boolean isLocal(final String host, final InetAddress hostaddress) {
return isLocal(host, hostaddress, true);
}
private static boolean isLocal(final String host, InetAddress hostaddress, final boolean recursive) {
if (noLocalCheck || // DO NOT REMOVE THIS! it is correct to return true if the check is off
@@ -912,7 +911,7 @@
return isLocal(hostaddress);
}
public static boolean isLocal(final InetAddress a) {
private static boolean isLocal(final InetAddress a) {
final boolean
localp = noLocalCheck || // DO NOT REMOVE THIS! it is correct to return true if the check is off
a == null ||
@@ -928,6 +927,8 @@
* find the locale for a given host. This feature is only available in full quality,
* if the file InetAddressLocator.jar is placed in the /lib directory (as a plug-in)
* from http://javainetlocator.sourceforge.net/
* In case you already know the InetAddress of the host, DO NOT call this method; call the
* overload that takes the InetAddress to get better results.
* @param host
* @return the locale for the host
*/
@@ -935,6 +936,7 @@
if (host == null) return null;
final Locale locale = getLocale(dnsResolve(host));
if (locale != null && locale.getCountry() != null && locale.getCountry().length() > 0) return locale;
final int p = host.lastIndexOf('.');
if (p < 0) return null;
String tld = host.substring(p + 1).toUpperCase();
@@ -945,6 +947,8 @@
/**
* find the locale for a given Address
* This uses the InetAddressLocator.jar library
* TODO: integrate http://www.maxmind.com/app/geolitecountry
* @param address
* @return
*/

@@ -1930,7 +1930,7 @@ public final class Switchboard extends serverSwitch {
assert response.getContent() != null;
try {
// parse the document
documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), getConfigBool("crawler.embedLinksAsDocuments", false));
documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), response.profile().directDocByURL());
if (documents == null) {
throw new Parser.Failure("Parser returned null.", response.url());
}
