- working direct importer for YaCy Crawl Starts
- working direct import for old bookmarks.db

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8052 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
apfelmaennchen 13 years ago
parent a635e43f40
commit 4f95f72124

@ -90,8 +90,8 @@
<input type="radio" name="importer" value="surro" /> Surrogate XML<br />
<input type="radio" name="importer" value="dmoz" disabled="disabled" /> DMOZ XML<br />
<input type="radio" name="importer" value="list" disabled="disabled"/> YaCy White/Black List<br />
<input type="radio" name="importer" value="bmks" disabled="disabled"/> YaCy bookmarks.db<br />
<input type="radio" name="importer" value="crawls" disabled="disabled"/> YaCy Crawl-Starts<br />
<input type="radio" name="importer" value="bmks" /> YaCy old bookmarks.db<br />
<input type="radio" name="importer" value="crawls" /> YaCy Crawl Starts<br />
</p>
<p>
<small>Bookmark file</small>
@ -110,7 +110,7 @@
<hr />
<h4>Automatic tagging<img title="help" alt="help" class="help" src="/yacy/ui/img-2/question_blue.png"></h4>
<p>
<input type="radio" name="autotag" value="off" /> Off
<input type="radio" name="autotag" value="off" checked="checked" /> Off
<br />
<input type="radio" name="autotag" value="empty" /> Only for empty tags
<br />

@ -2,23 +2,35 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.EnumMap;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import org.xml.sax.SAXException;
import de.anomic.data.BookmarksDB;
import de.anomic.data.UserDB;
import de.anomic.data.WorkTables;
import de.anomic.data.ymark.YMarkAutoTagger;
import de.anomic.data.ymark.YMarkEntry;
import de.anomic.data.ymark.YMarkHTMLImporter;
import de.anomic.data.ymark.YMarkJSONImporter;
import de.anomic.data.ymark.YMarkMetadata;
import de.anomic.data.ymark.YMarkTables;
import de.anomic.data.ymark.YMarkUtil;
import de.anomic.data.ymark.YMarkXBELImporter;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -69,7 +81,7 @@ public class import_ymark {
if(post.containsKey("root") && post.get("root").length() > 0) {
root = post.get("root");
}
if(post.containsKey("bmkfile") && post.containsKey("importer")){
if(post.containsKey("bmkfile") && !post.get("bmkfile").isEmpty() && post.containsKey("importer")){
stream = new ByteArrayInputStream(UTF8.getBytes(post.get("bmkfile$file")));
if(post.get("importer").equals("surro") && stream != null) {
SurrogateReader surrogateReader;
@ -133,7 +145,73 @@ public class import_ymark {
prop.put("result", "1");
}
}
}
} else if(post.containsKey("importer") && post.get("importer").equals("crawls")) {
try {
final Pattern pattern = Pattern.compile("^crawl start for.*");
final Iterator<Tables.Row> APIcalls = sb.tables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_COMMENT, pattern);
Tables.Row row = null;
while(APIcalls.hasNext()) {
row = APIcalls.next();
if(row.get(WorkTables.TABLE_API_COL_TYPE, "").equals("crawler")) {
final String url = row.get(WorkTables.TABLE_API_COL_COMMENT, "").substring(16);
final YMarkMetadata meta = new YMarkMetadata(new DigestURI(url), sb.indexSegments);
final Document document = meta.loadDocument(sb.loader);
final EnumMap<YMarkMetadata.METADATA, String> metadata = meta.loadMetadata();
final YMarkEntry bmk_entry = new YMarkEntry(false);
bmk_entry.put(YMarkEntry.BOOKMARK.URL.key(), url);
if(!sb.tables.has(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user), YMarkUtil.getBookmarkId(url))) {
bmk_entry.put(YMarkEntry.BOOKMARK.PUBLIC.key(), "false");
bmk_entry.put(YMarkEntry.BOOKMARK.TITLE.key(), metadata.get(YMarkMetadata.METADATA.TITLE));
bmk_entry.put(YMarkEntry.BOOKMARK.DESC.key(), metadata.get(YMarkMetadata.METADATA.DESCRIPTION));
}
bmk_entry.put(YMarkEntry.BOOKMARK.FOLDERS.key(), root);
if(autotag) {
bmk_entry.put(YMarkEntry.BOOKMARK.TAGS.key(), YMarkAutoTagger.autoTag(document, 3, sb.tables.bookmarks.getTags(bmk_user)));
}
sb.tables.bookmarks.addBookmark(bmk_user, bmk_entry, merge, true);
}
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (RowSpaceExceededException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (Failure e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else if(post.containsKey("importer") && post.get("importer").equals("bmks")) {
final Iterator<String> bit=sb.bookmarksDB.getBookmarksIterator(isAdmin);
BookmarksDB.Bookmark bookmark;
while(bit.hasNext()){
bookmark=sb.bookmarksDB.getBookmark(bit.next());
final YMarkEntry bmk_entry = new YMarkEntry(false);
bmk_entry.put(YMarkEntry.BOOKMARK.URL.key(), bookmark.getUrl());
try {
if(!sb.tables.has(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user), YMarkUtil.getBookmarkId(bookmark.getUrl()))) {
bmk_entry.put(YMarkEntry.BOOKMARK.PUBLIC.key(), bookmark.getPublic() ? "true" : "false");
bmk_entry.put(YMarkEntry.BOOKMARK.TITLE.key(), bookmark.getTitle());
bmk_entry.put(YMarkEntry.BOOKMARK.DESC.key(), bookmark.getDescription());
bmk_entry.put(YMarkEntry.BOOKMARK.TAGS.key(), bookmark.getTagsString());
bmk_entry.put(YMarkEntry.BOOKMARK.FOLDERS.key(), root+bookmark.getFoldersString().replaceAll(".*"+YMarkUtil.TAGS_SEPARATOR+YMarkUtil.FOLDERS_SEPARATOR, root+YMarkUtil.FOLDERS_SEPARATOR));
}
if(autotag) {
bmk_entry.put(YMarkEntry.BOOKMARK.TAGS.key(), YMarkAutoTagger.autoTag(bookmark.getUrl(), sb.loader, 3, sb.tables.bookmarks.getTags(bmk_user)));
}
sb.tables.bookmarks.addBookmark(bmk_user, bmk_entry, merge, true);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (RowSpaceExceededException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
if(post.containsKey("autotag") && !post.get("autotag", "off").equals("off")) {
try {
autoTaggingQueue.put(YMarkAutoTagger.POISON);

@ -18,6 +18,7 @@ function bm_action(com,grid) {
}
else if (com=='Add') {
$('#bmaddform').resetForm();
$("input[name='bm_url']").removeAttr("disabled");
$("#bm_url").blur(function() {
var url = $("input[name='bm_url']").getValue();
$.ajax({
@ -47,6 +48,7 @@ function bm_action(com,grid) {
alert("Editing of more than one selected bookmark is currently not supportet!");
return false;
}
$("input[name='bm_url']").attr("disabled","disabled");
$("input[name='bm_url']").setValue($('.trSelected',grid).find('.url').text());
$("input[name='bm_title']").setValue($('.trSelected',grid).find('h3.linktitle').text().trim());
$("textarea[name='bm_desc']").setValue($('.trSelected',grid).find('p.desc').text().trim());

@ -1,5 +1,6 @@
/* Initialize Tag Actions */
function tag_action(com,grid) {
alert("Sorry, the function you have requested is not yet available!");
if (com=='Add') {
flex = grid;
$('#tagaddform').resetForm();

@ -20,7 +20,7 @@ $(document).ready(function() {
{display: 'Title', name : 'title', width : 400, sortable : true, align: 'left'},
{display: 'Tags', name : 'tags', width : 160, sortable : false, align: 'left'},
{display: 'Folders', name : 'folders', width : 160, sortable : true, align: 'left', hide: true},
{display: 'Date added', name : 'date_added', width : 100, sortable : true, align: 'left', hide: true},
{display: 'Date added', name : 'date_added', width : 100, sortable : true, align: 'left'},
{display: 'Date modified', name : 'date_modified', width : 100, sortable : true, align: 'left'},
{display: 'Date visited', name : 'date_visited', width : 100, sortable : true, align: 'left', hide: true},
{display: 'API PK', name : 'apicall_pk', width : 85, sortable : true, align: 'left', hide: true},
@ -98,17 +98,34 @@ $(document).ready(function() {
});
$("#ymarks_treeview").bind("click", function(event) {
if ($(event.target).is("li") || $(event.target).parents("li").length) {
var folder = $(event.target).parents("li").filter(":first").attr("id");
$('#ymarks_flexigrid').flexOptions({
query: folder,
qtype: "_folder",
newp: 1
});
$('#ymarks_flexigrid').flexReload();
return false;
}
if ($(event.target).is("li") || $(event.target).parents("li").length) {
var folder = $(event.target).parents("li").filter(":first").attr("id");
$('#ymarks_flexigrid').flexOptions({
query: folder,
qtype: "_folder",
newp: 1
});
$('#ymarks_flexigrid').flexReload();
return false;
}
});
// Keep the import form in sync with the selected importer.
// The 'crawls' and 'bmks' importers do not use an uploaded bookmark file,
// so the file input is disabled for both; every other importer needs it.
$('input[name=importer]').change(function() {
    var importer = $("input[name=importer]:checked").val();

    // Only the crawl start importer gets its own default root folder.
    if (importer == 'crawls') {
        $("input[name='root']").setValue("/Crawl Start");
    }
    else {
        $("input[name='root']").setValue("/Imported Bookmarks");
    }

    // Decide the state of the file input exactly once. Two independent
    // if/else blocks would let the 'bmks' check re-enable the input right
    // after the 'crawls' check had disabled it.
    if (importer == 'crawls' || importer == 'bmks') {
        $("input[name='bmkfile']").attr("disabled","disabled");
    }
    else {
        $("input[name='bmkfile']").removeAttr("disabled");
    }
});
$("#example").multiselect();

@ -139,7 +139,7 @@ public class YMarkEntry extends TreeMap<String, String> {
}
}
private void setDefaults() {
public void setDefaults() {
for (BOOKMARK b : BOOKMARK.values()) {
if(!b.deflt().isEmpty() && !this.containsKey(b.key())) {
this.put(b.key(), b.deflt());

@ -267,7 +267,11 @@ public class YMarkTables {
bmk_row = this.worktables.select(bmk_table, urlHash);
if (bmk_row == null) {
// create and insert new entry
this.worktables.insert(bmk_table, urlHash, bmk.getData());
if(!bmk.containsKey(YMarkEntry.BOOKMARK.DATE_ADDED.key())) {
bmk.put(YMarkEntry.BOOKMARK.DATE_ADDED.key(), date);
bmk.put(YMarkEntry.BOOKMARK.DATE_MODIFIED.key(), date);
}
this.worktables.insert(bmk_table, urlHash, bmk.getData());
} else {
// modify and update existing entry
HashSet<String> oldSet;

Loading…
Cancel
Save