Merge branch 'master' of git://github.com/f1ori/yacy

pull/1/head
admin 13 years ago
commit 29f07fea33

@ -167,23 +167,8 @@ public class Crawler_p {
// special cases:
if (crawlingStartURL!= null && fullDomain) {
if (crawlingStartURL.isFile()) {
newcrawlingMustMatch = "file://" + crawlingStartURL.getPath();
} else if (crawlingStartURL.isSMB()) {
newcrawlingMustMatch = "smb://" + crawlingStartURL.getHost();
} else if (crawlingStartURL.isFTP()) {
newcrawlingMustMatch = "ftp://" + crawlingStartURL.getHost();
} else {
final String host = crawlingStartURL.getHost();
if (host.startsWith("www.")) {
newcrawlingMustMatch = "https?://" + crawlingStartURL.getHost();
} else {
// if the www is not given we accept that also
newcrawlingMustMatch = "https?://(www.)?" + crawlingStartURL.getHost();
}
}
if (subPath) newcrawlingMustMatch += crawlingStartURL.getPath();
newcrawlingMustMatch += ".*";
newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL);
if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";

@ -9,6 +9,9 @@
<link media="screen" type="text/css" href="/jquery/css/jquery.multiselect.css" rel="stylesheet" />
<script src="/jquery/js/jquery.multiselect.min.js" type="text/javascript"></script>
<link media="screen" type="text/css" href="/jquery/css/jquery.tagsinput.css" rel="stylesheet" />
<script src="/jquery/js/jquery.tagsinput.min.js" type="text/javascript"></script>
<link media="screen" type="text/css" href="/jquery/css/jquery.multiselect.filter.css" rel="stylesheet" />
<script src="/jquery/js/jquery.multiselect.filter.min.js" type="text/javascript"></script>
@ -138,9 +141,9 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<div id="ymarks_import_tab">
<form action="/api/ymarks/import_ymark.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<input type="hidden" value="/YMarks.html" name="redirect">
<h4>Bookmark Importer<img title="help" alt="help" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<h4>Bookmark Importer<img alt="help" title="If you put in your bookmarks here, you can access them anywhere where you have access to your YaCy peer. Think of it as your 'personal cloud' for bookmarking." class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<input type="radio" name="importer" value="html" /> Netscape HTML<br />
<input type="radio" name="importer" value="html" checked="checked" /> Netscape HTML<br />
<input type="radio" name="importer" value="json" /> Firefox JSON<br />
<input type="radio" name="importer" value="xbel" /> XBEL<br />
<input type="radio" name="importer" value="surro" /> Surrogate XML<br />
@ -154,7 +157,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<input class="input" type="file" name="bmkfile" id="bmkfile" size="8" /><br />
</p>
<hr />
<h4>Folder settings<img title="help" alt="help" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<h4>Folder settings<img alt="help" title="A folder structure is helpful to organize your bookmarks in a hierarchical way." class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<small>Source folder</small>
<input class="input" type="text" name="source" id="source" value="" disabled="disabled" />
@ -164,7 +167,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<input class="input" type="text" name="root" id="root" value="/Imported Bookmarks" />
</p>
<hr />
<h4>Automatic tagging<img title="help" alt="help" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<h4>Automatic tagging<img alt="help" title="Tags are words that are attached to documents as metadata. It is possible to read all the documents and find the attached tags automatically." class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<input type="radio" name="autotag" value="off" checked="checked" /> Off
<br />
@ -175,6 +178,19 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<input type="radio" name="autotag" value="merge" /> Merging with existing tags
</p>
<hr />
<h4>Automatic Indexing<img alt="help" title="While doing the bookmark import, YaCy can push all URLs to the indexing process" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<input type="radio" name="indexing" value="off" checked="checked" /> No indexing
<br />
<input type="radio" name="indexing" value="single" /> Index every bookmark entry
<br />
<input type="radio" name="indexing" value="onelink" /> Index every bookmark entry plus all directly linked pages
<br />
<input type="radio" name="indexing" value="fulldomain" /> Index all domains from all bookmarks completely
<br /><br />
<input type="checkbox" name="medialink" /> also all media (image/movie/document) links
</p>
<hr />
<p style="text-align: right">
<input type="submit" name="importbookmarks" value="Import" />
</p>
@ -195,7 +211,21 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<!-- Bookmarks Edit Dialog -->
<div id="ymarks_add_dialog" class="bm_dialog" title="Add & Edit Bookmark">
<img id="bmaddimg" src="/yacy/ui/img-1/Star.png" />
<table>
<tr>
<td>
<img id="bmaddimg" src="/yacy/ui/img-1/Star.png" />
</td>
<td>
<label for="bm_public">Public:</label>
<br />
<select name="public" id="bm_public" class="bm_select">
<option value="true">yes</option>
<option value="false">no</option>
</select>
</td>
</tr>
</table>
<form id="bmaddform" method="post" accept-charset="UTF-8" action="jQuery"><div>
<label for="bm_url">URL:</label>
<br />
@ -217,13 +247,6 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<label for="bm_tags">Tags (comma separated):</label>
<br />
<input type="text" name="tags" id="bm_tags" class="bm_input" size="80" />
<br />
<label for="bm_public">Public:</label>
<br />
<select name="public" id="bm_public" class="bm_select">
<option value="true">yes</option>
<option value="false">no</option>
</select>
</div></form>
</div>
<div id="ymarks_crawlstart" class="bm_dialog" title="Craw Start">

@ -3,21 +3,27 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Parser.Failure;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import org.xml.sax.SAXException;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.retrieval.Request;
import de.anomic.data.BookmarksDB;
import de.anomic.data.UserDB;
import de.anomic.data.WorkTables;
@ -54,6 +60,8 @@ public class import_ymark {
boolean autotag = false;
boolean merge = false;
boolean empty = false;
final String indexing = post.get("indexing", "off");
final boolean medialink = post.getBoolean("medialink", false);
if(post.containsKey("autotag") && !post.get("autotag", "off").equals("off")) {
autotag = true;
@ -67,7 +75,7 @@ public class import_ymark {
t.start();
}
if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) {
if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) {
bmk_user = post.get("table").substring(0, post.get("table").indexOf('_',0));
}
if(post.containsKey("redirect") && post.get("redirect").length() > 0) {
@ -92,7 +100,7 @@ public class import_ymark {
t = new Thread(surrogateReader, "YMarks - Surrogate Reader");
t.start();
while ((bmk = new YMarkEntry(surrogateReader.take())) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
} else {
@ -110,7 +118,7 @@ public class import_ymark {
t = new Thread(htmlImporter, "YMarks - HTML Importer");
t.start();
while ((bmk = htmlImporter.take()) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
} else if(post.get("importer").equals("xbel") && reader != null) {
@ -127,7 +135,7 @@ public class import_ymark {
t = new Thread(xbelImporter, "YMarks - XBEL Importer");
t.start();
while ((bmk = xbelImporter.take()) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
} else if(post.get("importer").equals("json") && reader != null) {
@ -136,7 +144,7 @@ public class import_ymark {
t = new Thread(jsonImporter, "YMarks - JSON Importer");
t.start();
while ((bmk = jsonImporter.take()) != YMarkEntry.POISON) {
putBookmark(sb.tables.bookmarks, bmk_user, bmk, autoTaggingQueue, autotag, empty);
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
}
prop.put("status", "1");
}
@ -219,13 +227,13 @@ public class import_ymark {
return prop;
}
public static void putBookmark(final YMarkTables ymarks, final String bmk_user, final YMarkEntry bmk,
final ArrayBlockingQueue<String> autoTaggingQueue, final boolean autotag, final boolean empty) {
public static void putBookmark(final Switchboard sb, final String bmk_user, final YMarkEntry bmk,
final ArrayBlockingQueue<String> autoTaggingQueue, final boolean autotag, final boolean empty, final String indexing, final boolean medialink) {
try {
final String url = bmk.get(YMarkEntry.BOOKMARK.URL.key());
// other protocols could cause problems
if(url != null && url.startsWith("http")) {
ymarks.addBookmark(bmk_user, bmk, true, true);
sb.tables.bookmarks.addBookmark(bmk_user, bmk, true, true);
if(autotag) {
if(!empty) {
autoTaggingQueue.put(url);
@ -233,6 +241,16 @@ public class import_ymark {
autoTaggingQueue.put(url);
}
}
// fill crawler
if (indexing.equals("single")) {
crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 0, true, medialink);
} else if (indexing.equals("onelink")) {
crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 1, true, medialink);
} else if (indexing.equals("fulldomain")) {
final DigestURI u = new DigestURI(url);
crawlStart(sb, u, CrawlProfile.mustMatchFilterFullDomain(u), CrawlProfile.MATCH_NEVER_STRING, 99, false, medialink);
}
}
} catch (final IOException e) {
Log.logException(e);
@ -242,6 +260,35 @@ public class import_ymark {
Log.logException(e);
}
}
/**
 * Creates a crawl profile for the given start URL, registers it as an active
 * profile on the switchboard's crawler and hands the start URL to the crawl
 * stacker as the root request of that profile.
 *
 * @param sb              switchboard providing the crawler, the crawl stacker and the local seed
 * @param startURL        URL at which the crawl begins
 * @param urlMustMatch    first filter expression handed to the profile
 * @param urlMustNotMatch second filter expression handed to the profile
 * @param depth           depth value handed to the profile
 * @param crawlingQ       flag handed through to the profile constructor
 * @param medialink       flag handed through to the profile constructor
 * @return whatever {@code stackCrawl} returns for the root request
 *         (NOTE(review): presumably a rejection reason or null on success - confirm against CrawlStacker)
 */
public static String crawlStart(
        final Switchboard sb,
        final DigestURI startURL,
        final String urlMustMatch,
        final String urlMustNotMatch,
        final int depth,
        final boolean crawlingQ, final boolean medialink) {
    // the profile is named after the host; URLs without a host use their normal form instead
    final String host = startURL.getHost();
    final String profileName = (host != null) ? host : startURL.toNormalform(true, false);
    final long recrawlDate = CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE);

    // the remaining literal arguments are passed exactly as before; their meaning
    // is defined by the CrawlProfile constructor
    final CrawlProfile profile = new CrawlProfile(
            profileName, null,
            urlMustMatch,
            urlMustNotMatch,
            CrawlProfile.MATCH_ALL_STRING,
            CrawlProfile.MATCH_NEVER_STRING,
            "", depth, medialink,
            recrawlDate, -1, crawlingQ,
            true, true, true, false, true, true, true,
            CacheStrategy.IFFRESH);

    // register the profile under its handle before stacking the root URL that refers to it
    final String handle = profile.handle();
    sb.crawler.putActive(handle.getBytes(), profile);

    return sb.crawlStacker.stackCrawl(new Request(
            sb.peers.mySeed().hash.getBytes(),
            startURL,
            null,
            "CRAWLING-ROOT",
            new Date(),
            handle, 0, 0, 0, 0
    ));
}
}

@ -108,6 +108,10 @@ img.help {
margin: 5px 5px 5px 5px;
}
#bm_tags_tagsinput {
margin: 0px 0px 2px 5px;
}
/* YaCy Flexigrid ---------------------------*/
.flexigrid div.fbutton .burst {

@ -0,0 +1,7 @@
/* Container that holds the tag chips and the free-text input. */
div.tagsinput {
    border: 1px solid #CCC;
    background: #FFF;
    padding: 5px;
    width: 300px;
    height: 100px;
    overflow-y: auto;
}

/* A single tag chip. */
div.tagsinput span.tag {
    border: 1px solid #a5d24a;
    -moz-border-radius: 2px;
    -webkit-border-radius: 2px;
    display: block;
    float: left;
    padding: 5px;
    text-decoration: none;
    background: #cde69c;
    color: #638421;
    margin-right: 5px;
    margin-bottom: 5px;
    font-family: helvetica;
    font-size: 13px;
}

/* Link inside a tag chip. */
div.tagsinput span.tag a {
    font-weight: bold;
    color: #82ad2b;
    text-decoration: none;
    font-size: 11px;
}

/* Text field inside the container; the margin shorthand must stay ahead of
   the right/bottom overrides that follow it. */
div.tagsinput input {
    width: 80px;
    margin: 0px;
    font-family: helvetica;
    font-size: 13px;
    border: 1px solid transparent;
    padding: 5px;
    background: transparent;
    color: #000;
    outline: 0px;
    margin-right: 5px;
    margin-bottom: 5px;
}

/* Wrapper divs float alongside the tag chips. */
div.tagsinput div {
    display: block;
    float: left;
}

/* Zero-height element that clears the floats above. */
.tags_clear {
    clear: both;
    width: 100%;
    height: 0px;
}

/* Highlight applied via the not_valid class. */
.not_valid {
    background: #FBD8DB !important;
    color: #90111A !important;
}

File diff suppressed because one or more lines are too long

@ -19,6 +19,7 @@ function bm_action(com,grid) {
}
else if (com=='Add') {
$('#bmaddform').resetForm();
$('#bm_tags').importTags('');
$("#bm_url").removeAttr("disabled");
$("#bm_url").blur(function() {
var url = $("#bm_url").getValue();
@ -44,7 +45,8 @@ function bm_action(com,grid) {
}
$("#bm_title").setValue(title);
$("#bm_desc").setValue(desc);
$("#bm_tags").setValue(tags);
/* $("#bm_tags").setValue(tags); */
$('#bm_tags').importTags(tags);
}
});
});
@ -58,7 +60,8 @@ function bm_action(com,grid) {
$("#bm_url").setValue($('.trSelected',grid).find('.url').text());
$("#bm_title").setValue($('.trSelected',grid).find('h3.linktitle').text().trim());
$("#bm_desc").setValue($('.trSelected',grid).find('p.desc').text().trim());
$("#bm_tags").setValue($('.trSelected',grid).find('p.tags').text().trim().replace(/,\s/g,","));
$('#bm_tags').importTags($('.trSelected',grid).find('p.tags').text().trim().replace(/,\s/g,","));
/* $("#bm_tags").setValue($('.trSelected',grid).find('p.tags').text().trim().replace(/,\s/g,",")); */
$("#bm_path").setValue($('.trSelected',grid).find('p.folders').text().replace(/,\s/g,","));
$("#bm_public").setValue($('.trSelected',grid).find('img').attr('alt'));
$("#ymarks_add_dialog").dialog('open');
@ -111,10 +114,21 @@ function bm_action(com,grid) {
}
function bm_dialog() {
/* Init Tag Input */
$('#bm_tags').tagsInput({
'height':'105px',
'width':'270px',
'interactive':true,
'removeWithBackspace' : true,
'minChars' : 0,
'maxChars' : 0,
'placeholderColor' : '#666666'
});
/* Initialize Bookmark Dialog */
$("#ymarks_add_dialog").dialog({
autoOpen: false,
height: 450,
height: 500,
width: 340,
position: ['top',100],
modal: true,

@ -7,7 +7,7 @@ $(document).ready(function() {
qtag = "";
/* Initialize Bookmark Dialog */
bm_dialog();
bm_dialog();
/* Initialize Flexigrid */
$('#ymarks_flexigrid').flexigrid({

@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
@ -476,4 +477,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static long getRecrawlDate(final long oldTimeMinutes) {
return System.currentTimeMillis() - (60000L * oldTimeMinutes);
}
/**
 * Builds a must-match filter (a regular expression) that restricts a crawl to
 * the location of the given start URL.
 * <ul>
 * <li>file URLs: {@code file://} followed by the URL's path</li>
 * <li>SMB / FTP URLs: {@code smb://} or {@code ftp://} followed by the host</li>
 * <li>all other URLs: {@code https?://} followed by the host; if the host does
 *     not start with {@code www.}, an optional {@code www.} prefix is accepted too</li>
 * </ul>
 * Host and path are inserted as quoted literals, so characters such as
 * {@code .} in {@code example.com} only match themselves instead of acting as
 * regex metacharacters. A missing (null or empty) host or path contributes
 * nothing, leaving only the scheme prefix as restriction.
 * <p>
 * The returned expression always ends in {@code .*}; callers may rely on that
 * suffix when they extend the filter.
 *
 * @param crawlingStartURL the URL the crawl starts from; must not be null
 * @return the filter expression, always terminated by {@code .*}
 */
public static String mustMatchFilterFullDomain(final MultiProtocolURI crawlingStartURL) {
    if (crawlingStartURL.isFile()) {
        // file URLs are restricted by path, not by host
        final String path = crawlingStartURL.getPath();
        final String quotedPath = (path == null || path.isEmpty()) ? "" : Pattern.quote(path);
        return "file://" + quotedPath + ".*";
    }

    final String host = crawlingStartURL.getHost();
    // Pattern.quote rejects null, and an empty quote would only add noise to the filter
    final String quotedHost = (host == null || host.isEmpty()) ? "" : Pattern.quote(host);

    if (crawlingStartURL.isSMB()) {
        return "smb://" + quotedHost + ".*";
    }
    if (crawlingStartURL.isFTP()) {
        return "ftp://" + quotedHost + ".*";
    }
    if (host != null && host.startsWith("www.")) {
        return "https?://" + quotedHost + ".*";
    }
    // if the www is not given we accept that also; the dot is escaped so that
    // only a literal "www." prefix is accepted
    return "https?://(www\\.)?" + quotedHost + ".*";
}
}

Loading…
Cancel
Save