diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 36a4de9f9..dff58b49b 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -167,23 +167,8 @@ public class Crawler_p { // special cases: if (crawlingStartURL!= null && fullDomain) { - if (crawlingStartURL.isFile()) { - newcrawlingMustMatch = "file://" + crawlingStartURL.getPath(); - } else if (crawlingStartURL.isSMB()) { - newcrawlingMustMatch = "smb://" + crawlingStartURL.getHost(); - } else if (crawlingStartURL.isFTP()) { - newcrawlingMustMatch = "ftp://" + crawlingStartURL.getHost(); - } else { - final String host = crawlingStartURL.getHost(); - if (host.startsWith("www.")) { - newcrawlingMustMatch = "https?://" + crawlingStartURL.getHost(); - } else { - // if the www is not given we accept that also - newcrawlingMustMatch = "https?://(www.)?" + crawlingStartURL.getHost(); - } - } - if (subPath) newcrawlingMustMatch += crawlingStartURL.getPath(); - newcrawlingMustMatch += ".*"; + newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL); + if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*"; } if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) { newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; diff --git a/htroot/YMarks.html b/htroot/YMarks.html index 1f9c5119e..757b8de9a 100644 --- a/htroot/YMarks.html +++ b/htroot/YMarks.html @@ -9,6 +9,9 @@ + + + @@ -138,9 +141,9 @@ To see a list of all APIs, please visit the
-

Bookmark Importerhelp

+

Bookmark Importerhelp

- Netscape HTML
+ Netscape HTML
Firefox JSON
XBEL
Surrogate XML
@@ -154,7 +157,7 @@ To see a list of all APIs, please visit the


-

Folder settingshelp

+

Folder settingshelp

Source folder @@ -164,7 +167,7 @@ To see a list of all APIs, please visit the


-

Automatic tagginghelp

+

Automatic tagginghelp

Off
@@ -175,6 +178,19 @@ To see a list of all APIs, please visit the
Merging with existing tags


+

Automatic Indexinghelp

+

+ No indexing +
+ Index every bookmark entry +
+ Index every bookmark entry plus all directly linked pages +
+ Index all domains from all bookmarks completely +

+ also all media (image/movie/document) links +

+

@@ -195,7 +211,21 @@ To see a list of all APIs, please visit the
- + + + + + +
+ + + +
+ +

@@ -217,13 +247,6 @@ To see a list of all APIs, please visit the
Tags (comma separated):
-
- -
-
diff --git a/htroot/api/ymarks/import_ymark.java b/htroot/api/ymarks/import_ymark.java index 9097c27d7..40a647483 100644 --- a/htroot/api/ymarks/import_ymark.java +++ b/htroot/api/ymarks/import_ymark.java @@ -3,21 +3,27 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; +import java.util.Date; import java.util.Iterator; import java.util.concurrent.ArrayBlockingQueue; import java.util.regex.Pattern; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Parser.Failure; import net.yacy.document.content.SurrogateReader; import net.yacy.kelondro.blob.Tables; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import org.xml.sax.SAXException; +import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.CrawlSwitchboard; +import de.anomic.crawler.retrieval.Request; import de.anomic.data.BookmarksDB; import de.anomic.data.UserDB; import de.anomic.data.WorkTables; @@ -54,6 +60,8 @@ public class import_ymark { boolean autotag = false; boolean merge = false; boolean empty = false; + final String indexing = post.get("indexing", "off"); + final boolean medialink = post.getBoolean("medialink", false); if(post.containsKey("autotag") && !post.get("autotag", "off").equals("off")) { autotag = true; @@ -67,7 +75,7 @@ public class import_ymark { t.start(); } - if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) { + if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) { bmk_user = post.get("table").substring(0, post.get("table").indexOf('_',0)); } if(post.containsKey("redirect") && post.get("redirect").length() > 0) { @@ -92,7 +100,7 @@ public class import_ymark { t = new Thread(surrogateReader, "YMarks - Surrogate Reader"); t.start(); while ((bmk = new YMarkEntry(surrogateReader.take())) != YMarkEntry.POISON) { - putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty); + putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink); } prop.put("status", "1"); } else { @@ -110,7 +118,7 @@ public class import_ymark { t = new Thread(htmlImporter, "YMarks - HTML Importer"); t.start(); while ((bmk = htmlImporter.take()) != YMarkEntry.POISON) { - putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty); + putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink); } prop.put("status", "1"); } else if(post.get("importer").equals("xbel") && reader != null) { @@ -127,7 +135,7 @@ public class import_ymark { t = new Thread(xbelImporter, "YMarks - XBEL Importer"); t.start(); while ((bmk = xbelImporter.take()) != YMarkEntry.POISON) { - putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty); + putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink); } prop.put("status", "1"); } else if(post.get("importer").equals("json") && reader != null) { @@ -136,7 +144,7 @@ public class import_ymark { t = new Thread(jsonImporter, "YMarks - JSON Importer"); t.start(); while ((bmk = jsonImporter.take()) != YMarkEntry.POISON) { - putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty); + putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink); } prop.put("status", "1"); } @@ -219,13 +227,13 @@ public class import_ymark { return prop; } - public static void putBookmark(final YMarkTables ymarks, final String bmk_user, final YMarkEntry bmk, - final ArrayBlockingQueue autoTaggingQueue, final boolean autotag, final boolean empty) { + public static void putBookmark(final Switchboard sb, final String bmk_user, final YMarkEntry bmk, + final ArrayBlockingQueue autoTaggingQueue, final boolean autotag, final boolean empty, final String indexing, final boolean medialink) { try { final String url = bmk.get(YMarkEntry.BOOKMARK.URL.key()); // other protocols could cause problems if(url != null && url.startsWith("http")) { - ymarks.addBookmark(bmk_user, bmk, true, true); + sb.tables.bookmarks.addBookmark(bmk_user, bmk, true, true); if(autotag) { if(!empty) { autoTaggingQueue.put(url); @@ -233,6 +241,16 @@ public class import_ymark { autoTaggingQueue.put(url); } } + + // fill crawler + if (indexing.equals("single")) { + crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 0, true, medialink); + } else if (indexing.equals("onelink")) { + crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 1, true, medialink); + } else if (indexing.equals("fulldomain")) { + final DigestURI u = new DigestURI(url); + crawlStart(sb, u, CrawlProfile.mustMatchFilterFullDomain(u), CrawlProfile.MATCH_NEVER_STRING, 99, false, medialink); + } } } catch (final IOException e) { Log.logException(e); @@ -242,6 +260,35 @@ public class import_ymark { Log.logException(e); } } + + public static String crawlStart( + final Switchboard sb, + final DigestURI startURL, + final String urlMustMatch, + final String urlMustNotMatch, + final int depth, + final boolean crawlingQ, final boolean medialink) { + final CrawlProfile pe = new CrawlProfile( + (startURL.getHost() == null) ? startURL.toNormalform(true, false) : startURL.getHost(), null, + urlMustMatch, + urlMustNotMatch, + CrawlProfile.MATCH_ALL_STRING, + CrawlProfile.MATCH_NEVER_STRING, + "", depth, medialink, + CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, crawlingQ, + true, true, true, false, true, true, true, + CacheStrategy.IFFRESH); + sb.crawler.putActive(pe.handle().getBytes(), pe); + return sb.crawlStacker.stackCrawl(new Request( + sb.peers.mySeed().hash.getBytes(), + startURL, + null, + "CRAWLING-ROOT", + new Date(), + pe.handle(), 0, 0, 0, 0 + )); + } + } diff --git a/htroot/env/yacy-ymarks.css b/htroot/env/yacy-ymarks.css index f0cc34c8e..50ab601bd 100644 --- a/htroot/env/yacy-ymarks.css +++ b/htroot/env/yacy-ymarks.css @@ -108,6 +108,10 @@ img.help { margin: 5px 5px 5px 5px; } +#bm_tags_tagsinput { + margin: 0px 0px 2px 5px; +} + /* YaCy Flexigrid ---------------------------*/ .flexigrid div.fbutton .burst { diff --git a/htroot/jquery/css/jquery.tagsinput.css b/htroot/jquery/css/jquery.tagsinput.css new file mode 100644 index 000000000..c595e249f --- /dev/null +++ b/htroot/jquery/css/jquery.tagsinput.css @@ -0,0 +1,7 @@ +div.tagsinput { border:1px solid #CCC; background: #FFF; padding:5px; width:300px; height:100px; overflow-y: auto;} +div.tagsinput span.tag { border: 1px solid #a5d24a; -moz-border-radius:2px; -webkit-border-radius:2px; display: block; float: left; padding: 5px; text-decoration:none; background: #cde69c; color: #638421; margin-right: 5px; margin-bottom:5px;font-family: helvetica; font-size:13px;} +div.tagsinput span.tag a { font-weight: bold; color: #82ad2b; text-decoration:none; font-size: 11px; } +div.tagsinput input { width:80px; margin:0px; font-family: helvetica; font-size: 13px; border:1px solid transparent; padding:5px; background: transparent; color: #000; outline:0px; margin-right:5px; margin-bottom:5px; } +div.tagsinput div { display:block; float: left; } +.tags_clear { clear: both; width: 100%; height: 0px; } +.not_valid {background: #FBD8DB !important; color: #90111A !important;} diff --git a/htroot/jquery/js/jquery.tagsinput.min.js b/htroot/jquery/js/jquery.tagsinput.min.js new file mode 100644 index 000000000..edc0d5f93 --- /dev/null +++ b/htroot/jquery/js/jquery.tagsinput.min.js @@ -0,0 +1 @@ +(function(a){var b=new Array;var c=new Array;a.fn.doAutosize=function(b){var c=a(this).data("minwidth"),d=a(this).data("maxwidth"),e="",f=a(this),g=a("#"+a(this).data("tester_id"));if(e===(e=f.val())){return}var h=e.replace(/&/g,"&").replace(/\s/g," ").replace(//g,">");g.html(h);var i=g.width(),j=i+b.comfortZone>=c?i+b.comfortZone:c,k=f.width(),l=j=c||j>c&&j").css({position:"absolute",top:-9999,left:-9999,width:"auto",fontSize:f.css("fontSize"),fontFamily:f.css("fontFamily"),fontWeight:f.css("fontWeight"),letterSpacing:f.css("letterSpacing"),whiteSpace:"nowrap"}),h=a(this).attr("id")+"_autosize_tester";if(!a("#"+h).length>0){g.attr("id",h);g.appendTo("body")}f.data("minwidth",c);f.data("maxwidth",d);f.data("tester_id",h);f.css("width",c)};a.fn.addTag=function(d,e){e=jQuery.extend({focus:false,callback:true},e);this.each(function(){var f=a(this).attr("id");var g=a(this).val().split(b[f]);if(g[0]==""){g=new Array}d=jQuery.trim(d);if(e.unique){var h=a(g).tagExist(d);if(h==true){a("#"+f+"_tag").addClass("not_valid")}}else{var h=false}if(d!=""&&h!=true){a("").addClass("tag").append(a("").text(d).append("  "),a("",{href:"#",title:"Removing tag",text:"x"}).click(function(){return a("#"+f).removeTag(escape(d))})).insertBefore("#"+f+"_addTag");g.push(d);a("#"+f+"_tag").val("");if(e.focus){a("#"+f+"_tag").focus()}else{a("#"+f+"_tag").blur()}a.fn.tagsInput.updateTagsField(this,g);if(e.callback&&c[f]&&c[f]["onAddTag"]){var i=c[f]["onAddTag"];i.call(this,d)}if(c[f]&&c[f]["onChange"]){var j=g.length;var i=c[f]["onChange"];i.call(this,a(this),g[j-1])}}});return false};a.fn.removeTag=function(d){d=unescape(d);this.each(function(){var e=a(this).attr("id");var f=a(this).val().split(b[e]);a("#"+e+"_tagsinput .tag").remove();str="";for(i=0;i=0};a.fn.importTags=function(b){id=a(this).attr("id");a("#"+id+"_tagsinput .tag").remove();a.fn.tagsInput.importTags(this,b)};a.fn.tagsInput=function(d){var e=jQuery.extend({interactive:true,defaultText:"add a tag",minChars:0,width:"300px",height:"100px",autocomplete:{selectFirst:false},hide:true,delimiter:",",unique:true,removeWithBackspace:true,placeholderColor:"#666666",autosize:true,comfortZone:20,inputPadding:6*2},d);this.each(function(){if(e.hide){a(this).hide()}var d=a(this).attr("id");var f=jQuery.extend({pid:d,real_input:"#"+d,holder:"#"+d+"_tagsinput",input_wrapper:"#"+d+"_addTag",fake_input:"#"+d+"_tag"},e);b[d]=f.delimiter;if(e.onAddTag||e.onRemoveTag||e.onChange){c[d]=new Array;c[d]["onAddTag"]=e.onAddTag;c[d]["onRemoveTag"]=e.onRemoveTag;c[d]["onChange"]=e.onChange}var g='
';if(e.interactive){g=g+''}g=g+'
';a(g).insertAfter(this);a(f.holder).css("width",e.width);a(f.holder).css("height",e.height);if(a(f.real_input).val()!=""){a.fn.tagsInput.importTags(a(f.real_input),a(f.real_input).val())}if(e.interactive){a(f.fake_input).val(a(f.fake_input).attr("data-default"));a(f.fake_input).css("color",e.placeholderColor);a(f.fake_input).resetAutosize(e);a(f.holder).bind("click",f,function(b){a(b.data.fake_input).focus()});a(f.fake_input).bind("focus",f,function(b){if(a(b.data.fake_input).val()==a(b.data.fake_input).attr("data-default")){a(b.data.fake_input).val("")}a(b.data.fake_input).css("color","#000000")});if(e.autocomplete_url!=undefined){autocomplete_options={source:e.autocomplete_url};for(attrname in e.autocomplete){autocomplete_options[attrname]=e.autocomplete[attrname]}if(jQuery.Autocompleter!==undefined){a(f.fake_input).autocomplete(e.autocomplete_url,e.autocomplete);a(f.fake_input).bind("result",f,function(b,c,f){if(c){a("#"+d).addTag(c[0]+"",{focus:true,unique:e.unique})}})}else if(jQuery.ui.autocomplete!==undefined){a(f.fake_input).autocomplete(autocomplete_options);a(f.fake_input).bind("autocompleteselect",f,function(b,c){a(b.data.real_input).addTag(c.item.value,{focus:true,unique:e.unique});return false})}}else{a(f.fake_input).bind("blur",f,function(b){var c=a(this).attr("data-default");if(a(b.data.fake_input).val()!=""&&a(b.data.fake_input).val()!=c){if(b.data.minChars<=a(b.data.fake_input).val().length&&(!b.data.maxChars||b.data.maxChars>=a(b.data.fake_input).val().length))a(b.data.real_input).addTag(a(b.data.fake_input).val(),{focus:true,unique:e.unique})}else{a(b.data.fake_input).val(a(b.data.fake_input).attr("data-default"));a(b.data.fake_input).css("color",e.placeholderColor)}return false})}a(f.fake_input).bind("keypress",f,function(b){if(b.which==b.data.delimiter.charCodeAt(0)||b.which==13){b.preventDefault();if(b.data.minChars<=a(b.data.fake_input).val().length&&(!b.data.maxChars||b.data.maxChars>=a(b.data.fake_input).val().length))a(b.data.real_input).addTag(a(b.data.fake_input).val(),{focus:true,unique:e.unique});a(b.data.fake_input).resetAutosize(e);return false}else if(b.data.autosize){a(b.data.fake_input).doAutosize(e)}});f.removeWithBackspace&&a(f.fake_input).bind("keydown",function(b){if(b.keyCode==8&&a(this).val()==""){b.preventDefault();var c=a(this).closest(".tagsinput").find(".tag:last").text();var d=a(this).attr("id").replace(/_tag$/,"");c=c.replace(/[\s]+x$/,"");a("#"+d).removeTag(escape(c));a(this).trigger("focus")}});a(f.fake_input).blur();if(f.unique){a(f.fake_input).keydown(function(b){if(b.keyCode==8||String.fromCharCode(b.which).match(/\w+|[áéíóúÁÉÍÓÚñÑ,/]+/)){a(this).removeClass("not_valid")}})}}return false});return this};a.fn.tagsInput.updateTagsField=function(c,d){var e=a(c).attr("id");a(c).val(d.join(b[e]))};a.fn.tagsInput.importTags=function(d,e){a(d).val("");var f=a(d).attr("id");var g=e.split(b[f]);for(i=0;i implements M public static long getRecrawlDate(final long oldTimeMinutes) { return System.currentTimeMillis() - (60000L * oldTimeMinutes); } + + public static String mustMatchFilterFullDomain(final MultiProtocolURI crawlingStartURL) { + if (crawlingStartURL.isFile()) { + return "file://" + crawlingStartURL.getPath() + ".*"; + } else if (crawlingStartURL.isSMB()) { + return "smb://" + crawlingStartURL.getHost() + ".*"; + } else if (crawlingStartURL.isFTP()) { + return "ftp://" + crawlingStartURL.getHost() + ".*"; + } else { + final String host = crawlingStartURL.getHost(); + if (host.startsWith("www.")) { + return "https?://" + crawlingStartURL.getHost() + ".*"; + } else { + // if the www is not given we accept that also + return "https?://(www.)?" + crawlingStartURL.getHost() + ".*"; + } + } + } }