diff --git a/defaults/yacy.init b/defaults/yacy.init index c8cb23058..1f5de72de 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -221,6 +221,12 @@ proxyCache = DATA/HTCACHE proxyCacheSize = 100 proxyCacheSize__pro = 1024 +# a path to the surrogate input directory +surrogates.in = DATA/SURROGATES/in + +# a path to the surrogate output directory +surrogates.out = DATA/SURROGATES/out + # storage place for new releases releases = DATA/RELEASE diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index 345abd311..0b852a406 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -114,7 +114,8 @@ public class CrawlProfileEditor_p { entry selentry; while (it.hasNext()) { selentry = it.next(); - if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_PROXY) || + if (selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_SURROGATE) || + selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_PROXY) || selentry.name().equals(plasmaWordIndex.CRAWL_PROFILE_REMOTE) /*|| selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) || selentry.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA)*/) diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index 4e9e72202..a72759b06 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -100,7 +100,8 @@ public class IndexCreateWWWLocalQueue_p { name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) || name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) || - name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) + name.equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) || + name.equals(plasmaWordIndex.CRAWL_PROFILE_SURROGATE)) continue; if (compiledPattern.matcher(name).find()) { sb.webIndex.profilesActiveCrawls.removeEntry(entry.handle()); diff --git a/htroot/WatchWebStructure_p.java b/htroot/WatchWebStructure_p.java index c8e8f3b3d..1d16b7247 100644 --- a/htroot/WatchWebStructure_p.java +++ b/htroot/WatchWebStructure_p.java @@ -41,7 +41,8 @@ public class WatchWebStructure_p { e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) || e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) || - e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) + e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) || + e.name().equals(plasmaWordIndex.CRAWL_PROFILE_SURROGATE)) continue; host = e.name(); break; // take the first one diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 6e7aa8816..1ff7e501b 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -85,6 +85,7 @@ package de.anomic.plasma; +import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -120,8 +121,10 @@ import de.anomic.crawler.ResourceObserver; import de.anomic.crawler.ResultImages; import de.anomic.crawler.ResultURLs; import de.anomic.crawler.RobotsTxt; +import de.anomic.crawler.Surrogate; import de.anomic.crawler.ZURL; import de.anomic.crawler.CrawlProfile.entry; +import de.anomic.crawler.IndexingStack.QueueEntry; import de.anomic.data.Blacklist; import de.anomic.data.URLLicense; import de.anomic.data.blogBoard; @@ -165,6 +168,7 @@ import de.anomic.server.serverSemaphore; import de.anomic.server.serverSwitch; import de.anomic.server.serverThread; import de.anomic.tools.crypt; +import de.anomic.xml.SurrogateReader; import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyNewsPool; @@ -200,6 +204,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch rankingPermissions; public plasmaWordIndex webIndex; public CrawlQueues crawlQueues; @@ -435,6 +441,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitchpublic static final String HTDOCS_PATH = "htDocsPath"

*

Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 184127b50..28538b0de 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -86,6 +86,8 @@ public final class plasmaWordIndex { public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText"; public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia"; public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia"; + public static final String CRAWL_PROFILE_SURROGATE = "surrogates"; + public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap"; public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap"; @@ -94,6 +96,7 @@ public final class plasmaWordIndex { public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L; + public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L; public static final ByteOrder wordOrder = Base64Order.enhancedCoder; @@ -108,6 +111,7 @@ public final class plasmaWordIndex { public CrawlProfile.entry defaultRemoteProfile; public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile; + public CrawlProfile.entry defaultSurrogateProfile; private final File queuesRoot; private IODispatcher merger; @@ -297,6 +301,7 @@ public final class plasmaWordIndex { this.defaultTextSnippetGlobalProfile = null; this.defaultMediaSnippetLocalProfile = null; this.defaultMediaSnippetGlobalProfile = null; + this.defaultSurrogateProfile = null; final Iterator i = this.profilesActiveCrawls.profiles(true); CrawlProfile.entry profile; String name; @@ -310,6 +315,7 @@ public final class plasmaWordIndex { if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) this.defaultTextSnippetGlobalProfile = profile; if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) this.defaultMediaSnippetLocalProfile = profile; if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) this.defaultMediaSnippetGlobalProfile = profile; + if (name.equals(CRAWL_PROFILE_SURROGATE)) this.defaultSurrogateProfile = profile; } } catch (final Exception e) { this.profilesActiveCrawls.clear(); @@ -319,6 +325,7 @@ public final class plasmaWordIndex { this.defaultTextSnippetGlobalProfile = null; this.defaultMediaSnippetLocalProfile = null; this.defaultMediaSnippetGlobalProfile = null; + this.defaultSurrogateProfile = null; } if (this.defaultProxyProfile == null) { @@ -356,6 +363,11 @@ public final class plasmaWordIndex { defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false); } + if (this.defaultSurrogateProfile == null) { + // generate new default entry for surrogate parsing + defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false); + } } private void resetProfiles() { @@ -387,7 +399,8 @@ public final class plasmaWordIndex { (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) || - (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)))) { + (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) || + (entry.name().equals(CRAWL_PROFILE_SURROGATE)))) { profilesPassiveCrawls.newEntry(entry.map()); iter.remove(); hasDoneSomething = true; diff --git a/source/de/anomic/xml/SurrogateReader.java b/source/de/anomic/xml/SurrogateReader.java index cc9f22ef7..13bcaddf2 100644 --- a/source/de/anomic/xml/SurrogateReader.java +++ b/source/de/anomic/xml/SurrogateReader.java @@ -101,7 +101,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato public void endElement(final String uri, final String name, final String tag) { if (tag == null) return; if ("document".equals(tag)) { + //System.out.println("A Title: " + this.surrogate.title()); this.surrogates.add(this.surrogate); + //System.out.println("B Title: " + this.surrogate.title()); this.surrogate = null; this.buffer.setLength(0); this.parsingValue = false; @@ -150,6 +152,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato Thread t = new Thread(sr); t.start(); Surrogate s; + System.out.println("1"); while (sr.hasNext()) { s = sr.next(); System.out.println("Title: " + s.title()); @@ -159,6 +162,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato System.out.println("Body: " + s.body()); System.out.println("Categories: " + s.categories()); } + System.out.println("2"); } catch (IOException e) { e.printStackTrace(); }