added an option to the bookmark import process to push all imported bookmark URLs into the crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8134 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 13 years ago
parent 77317a88e0
commit 11729061f2

@@ -167,23 +167,8 @@ public class Crawler_p {
// special cases:
if (crawlingStartURL != null && fullDomain) {
if (crawlingStartURL.isFile()) {
newcrawlingMustMatch = "file://" + crawlingStartURL.getPath();
} else if (crawlingStartURL.isSMB()) {
newcrawlingMustMatch = "smb://" + crawlingStartURL.getHost();
} else if (crawlingStartURL.isFTP()) {
newcrawlingMustMatch = "ftp://" + crawlingStartURL.getHost();
} else {
final String host = crawlingStartURL.getHost();
if (host.startsWith("www.")) {
newcrawlingMustMatch = "https?://" + crawlingStartURL.getHost();
} else {
// if the www is not given we accept that also
newcrawlingMustMatch = "https?://(www.)?" + crawlingStartURL.getHost();
}
}
if (subPath) newcrawlingMustMatch += crawlingStartURL.getPath();
newcrawlingMustMatch += ".*";
newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL);
if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*";
}
if (crawlingStart != null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
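
For reference, a minimal sketch (not part of the commit) of the filter strings the refactored branch is expected to build. It assumes the DigestURI and CrawlProfile classes from this tree; the class name, the example URL and the expected values in the comments are illustrative only.

import java.net.MalformedURLException;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.crawler.CrawlProfile;

public class FullDomainFilterSketch {
    public static void main(final String[] args) throws MalformedURLException {
        final DigestURI start = new DigestURI("http://www.example.net/docs/index.html");
        // the full-domain filter now comes from the shared helper instead of inline string concatenation
        final String fullDomain = CrawlProfile.mustMatchFilterFullDomain(start);
        // expected: "https?://www.example.net.*"
        // sub-path variant: cut the trailing ".*", re-append the start path, then add ".*" again
        final String subPath = fullDomain.substring(0, fullDomain.length() - 2) + start.getPath() + ".*";
        // expected: "https?://www.example.net/docs/index.html.*"
        System.out.println(fullDomain);
        System.out.println(subPath);
    }
}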

@@ -141,9 +141,9 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<div id="ymarks_import_tab">
<form action="/api/ymarks/import_ymark.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<input type="hidden" value="/YMarks.html" name="redirect">
<h4>Bookmark Importer<img title="help" alt="help" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<h4>Bookmark Importer<img alt="help" title="If you put in your bookmarks here, you can access them anywhere where you have access to your YaCy peer. Think of it as your 'personal cloud' for bookmarking." class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<input type="radio" name="importer" value="html" /> Netscape HTML<br />
<input type="radio" name="importer" value="html" checked="checked" /> Netscape HTML<br />
<input type="radio" name="importer" value="json" /> Firefox JSON<br />
<input type="radio" name="importer" value="xbel" /> XBEL<br />
<input type="radio" name="importer" value="surro" /> Surrogate XML<br />
@@ -157,7 +157,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<input class="input" type="file" name="bmkfile" id="bmkfile" size="8" /><br />
</p>
<hr />
<h4>Folder settings<img title="help" alt="help" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<h4>Folder settings<img alt="help" title="A folder structure is helpful to organize your bookmarks in a hierarchical way." class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<small>Source folder</small>
<input class="input" type="text" name="source" id="source" value="" disabled="disabled" />
@@ -167,7 +167,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<input class="input" type="text" name="root" id="root" value="/Imported Bookmarks" />
</p>
<hr />
<h4>Automatic tagging<img title="help" alt="help" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<h4>Automatic tagging<img alt="help" title="Tags are words that are attached to documents as metadata. It is possible to read all the documents and find the attached tags automatically." class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<input type="radio" name="autotag" value="off" checked="checked" /> Off
<br />
@@ -178,6 +178,19 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<input type="radio" name="autotag" value="merge" /> Merging with existing tags
</p>
<hr />
<h4>Automatic Indexing<img alt="help" title="While doing the bookmark import, YaCy can push all URLs to the indexing process" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<input type="radio" name="indexing" value="off" checked="checked" /> No indexing
<br />
<input type="radio" name="indexing" value="single" /> Index every bookmark entry
<br />
<input type="radio" name="indexing" value="onelink" /> Index every bookmark entry plus all directly linked pages
<br />
<input type="radio" name="indexing" value="fulldomain" /> Index all domains from all bookmarks completely
<br /><br />
<input type="checkbox" name="medialink" /> also all media (image/movie/document) links
</p>
<hr />
<p style="text-align: right">
<input type="submit" name="importbookmarks" value="Import" />
</p>

@@ -3,21 +3,27 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Parser.Failure;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import org.xml.sax.SAXException;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.retrieval.Request;
import de.anomic.data.BookmarksDB;
import de.anomic.data.UserDB;
import de.anomic.data.WorkTables;
@@ -54,6 +60,8 @@ public class import_ymark {
boolean autotag = false;
boolean merge = false;
boolean empty = false;
final String indexing = post.get("indexing", "off");
final boolean medialink = post.getBoolean("medialink", false);
if(post.containsKey("autotag") && !post.get("autotag", "off").equals("off")) {
autotag = true;
@@ -67,7 +75,7 @@ public class import_ymark {
t.start();
}
if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) {
if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) {
bmk_user = post.get("table").substring(0, post.get("table").indexOf('_',0));
}
if(post.containsKey("redirect") && post.get("redirect").length() > 0) {
@@ -92,7 +100,7 @@ public class import_ymark {
t = new Thread(surrogateReader, "YMarks - Surrogate Reader");
t.start();
while ((bmk = new YMarkEntry(surrogateReader.take())) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
} else {
@@ -110,7 +118,7 @@ public class import_ymark {
t = new Thread(htmlImporter, "YMarks - HTML Importer");
t.start();
while ((bmk = htmlImporter.take()) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
} else if(post.get("importer").equals("xbel") && reader != null) {
@@ -127,7 +135,7 @@ public class import_ymark {
t = new Thread(xbelImporter, "YMarks - XBEL Importer");
t.start();
while ((bmk = xbelImporter.take()) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
} else if(post.get("importer").equals("json") && reader != null) {
@@ -136,7 +144,7 @@ public class import_ymark {
t = new Thread(jsonImporter, "YMarks - JSON Importer");
t.start();
while ((bmk = jsonImporter.take()) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
}
@@ -219,13 +227,13 @@ public class import_ymark {
return prop;
}
public static void putBookmark(final YMarkTables ymarks, final String bmk_user, final YMarkEntry bmk,
final ArrayBlockingQueue<String> autoTaggingQueue, final boolean autotag, final boolean empty) {
public static void putBookmark(final Switchboard sb, final String bmk_user, final YMarkEntry bmk,
final ArrayBlockingQueue<String> autoTaggingQueue, final boolean autotag, final boolean empty, final String indexing, final boolean medialink) {
try {
final String url = bmk.get(YMarkEntry.BOOKMARK.URL.key());
// other protocols could cause problems
if(url != null && url.startsWith("http")) {
ymarks.addBookmark(bmk_user, bmk, true, true);
sb.tables.bookmarks.addBookmark(bmk_user, bmk, true, true);
if(autotag) {
if(!empty) {
autoTaggingQueue.put(url);
@@ -233,6 +241,16 @@ public class import_ymark {
autoTaggingQueue.put(url);
}
}
// fill crawler
if (indexing.equals("single")) {
crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 0, true, medialink);
} else if (indexing.equals("onelink")) {
crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 1, true, medialink);
} else if (indexing.equals("fulldomain")) {
final DigestURI u = new DigestURI(url);
crawlStart(sb, u, CrawlProfile.mustMatchFilterFullDomain(u), CrawlProfile.MATCH_NEVER_STRING, 99, false, medialink);
}
}
} catch (final IOException e) {
Log.logException(e);
@@ -242,6 +260,35 @@ public class import_ymark {
Log.logException(e);
}
}
public static String crawlStart(
final Switchboard sb,
final DigestURI startURL,
final String urlMustMatch,
final String urlMustNotMatch,
final int depth,
final boolean crawlingQ, final boolean medialink) {
final CrawlProfile pe = new CrawlProfile(
(startURL.getHost() == null) ? startURL.toNormalform(true, false) : startURL.getHost(), null,
urlMustMatch,
urlMustNotMatch,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
"", depth, medialink,
CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, crawlingQ,
true, true, true, false, true, true, true,
CacheStrategy.IFFRESH);
sb.crawler.putActive(pe.handle().getBytes(), pe);
return sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
startURL,
null,
"CRAWLING-ROOT",
new Date(),
pe.handle(), 0, 0, 0, 0
));
}
}
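
As a usage illustration only (the standalone class, the Switchboard lookup via Switchboard.getSwitchboard(), the example URL and the log message are assumptions, not part of the commit), this sketch shows how a single bookmark URL would be handed to the new crawlStart helper with the "fulldomain" setting; the "single" branch uses depth 0 and "onelink" depth 1, both with crawlingQ enabled.

import java.net.MalformedURLException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlProfile;

public class YMarkCrawlStartSketch {
    public static void main(final String[] args) throws MalformedURLException {
        final Switchboard sb = Switchboard.getSwitchboard(); // assumes a running peer
        final DigestURI u = new DigestURI("http://www.example.org/");
        // mirrors the "fulldomain" branch above: depth 99, crawlingQ off, media links included
        final String rejectReason = import_ymark.crawlStart(
                sb, u,
                CrawlProfile.mustMatchFilterFullDomain(u),
                CrawlProfile.MATCH_NEVER_STRING,
                99, false, true);
        // stackCrawl() returns null on success, otherwise a textual reject reason
        if (rejectReason != null) Log.logWarning("YMarks", "crawl start rejected: " + rejectReason);
    }
}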

@@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
@@ -476,4 +477,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static long getRecrawlDate(final long oldTimeMinutes) {
return System.currentTimeMillis() - (60000L * oldTimeMinutes);
}
public static String mustMatchFilterFullDomain(final MultiProtocolURI crawlingStartURL) {
if (crawlingStartURL.isFile()) {
return "file://" + crawlingStartURL.getPath() + ".*";
} else if (crawlingStartURL.isSMB()) {
return "smb://" + crawlingStartURL.getHost() + ".*";
} else if (crawlingStartURL.isFTP()) {
return "ftp://" + crawlingStartURL.getHost() + ".*";
} else {
final String host = crawlingStartURL.getHost();
if (host.startsWith("www.")) {
return "https?://" + crawlingStartURL.getHost() + ".*";
} else {
// if the www is not given we accept that also
return "https?://(www.)?" + crawlingStartURL.getHost() + ".*";
}
}
}
}
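
A short illustrative check (not part of the commit; the class name and host names are made up) of how the patterns returned by the new helper treat the www. prefix:

import java.net.MalformedURLException;
import java.util.regex.Pattern;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.crawler.CrawlProfile;

public class FullDomainWwwSketch {
    public static void main(final String[] args) throws MalformedURLException {
        // start URL without "www." -> the pattern accepts both spellings of the host
        final String f = CrawlProfile.mustMatchFilterFullDomain(new DigestURI("http://example.net/"));
        // f is "https?://(www.)?example.net.*"
        System.out.println(Pattern.matches(f, "http://example.net/page.html"));      // true
        System.out.println(Pattern.matches(f, "https://www.example.net/page.html")); // true
        // start URL with "www." -> the pattern is pinned to that host spelling
        final String g = CrawlProfile.mustMatchFilterFullDomain(new DigestURI("http://www.example.net/"));
        // g is "https?://www.example.net.*"
        System.out.println(Pattern.matches(g, "http://example.net/page.html"));      // false
    }
}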
