From 78d6d6ca0640149bb10645ac142f05dd3bb90794 Mon Sep 17 00:00:00 2001 From: apfelmaennchen Date: Fri, 8 Apr 2011 21:15:10 +0000 Subject: [PATCH] refactoring for ymarks git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7648 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Table_YMark_p.java | 19 +- htroot/api/ymarks/add_ymark.java | 7 +- htroot/api/ymarks/delete_ymark.java | 5 +- htroot/api/ymarks/get_treeview.java | 180 +++--- htroot/api/ymarks/get_xbel.java | 171 +++--- htroot/api/ymarks/get_ymark.java | 44 +- htroot/api/ymarks/import_ymark.java | 25 +- source/de/anomic/data/WorkTables.java | 9 +- source/de/anomic/data/YMarkIndex.java | 259 --------- source/de/anomic/data/YMarkTables.java | 517 ------------------ .../anomic/data/YMarkWordCountComparator.java | 27 - .../de/anomic/data/ymark/YMarkCrawlStart.java | 90 +++ source/de/anomic/data/ymark/YMarkDate.java | 92 ++++ .../YMarkHTMLImporter.java} | 38 +- .../de/anomic/data/ymark/YMarkMetadata.java | 201 +++++++ source/de/anomic/data/ymark/YMarkTables.java | 340 ++++++++++++ source/de/anomic/data/ymark/YMarkUtil.java | 114 ++++ .../data/ymark/YMarkWordCountComparator.java | 53 ++ .../YMarkXBELImporter.java} | 66 ++- 19 files changed, 1208 insertions(+), 1049 deletions(-) delete mode 100644 source/de/anomic/data/YMarkIndex.java delete mode 100644 source/de/anomic/data/YMarkTables.java delete mode 100644 source/de/anomic/data/YMarkWordCountComparator.java create mode 100644 source/de/anomic/data/ymark/YMarkCrawlStart.java create mode 100644 source/de/anomic/data/ymark/YMarkDate.java rename source/de/anomic/data/{YMarksHTMLImporter.java => ymark/YMarkHTMLImporter.java} (75%) create mode 100644 source/de/anomic/data/ymark/YMarkMetadata.java create mode 100644 source/de/anomic/data/ymark/YMarkTables.java create mode 100644 source/de/anomic/data/ymark/YMarkUtil.java create mode 100644 source/de/anomic/data/ymark/YMarkWordCountComparator.java rename source/de/anomic/data/{YMarksXBELImporter.java => 
ymark/YMarkXBELImporter.java} (80%) diff --git a/htroot/Table_YMark_p.java b/htroot/Table_YMark_p.java index d2627580d..dd9da4b36 100644 --- a/htroot/Table_YMark_p.java +++ b/htroot/Table_YMark_p.java @@ -11,7 +11,8 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; -import de.anomic.data.YMarkTables; +import de.anomic.data.ymark.YMarkTables; +import de.anomic.data.ymark.YMarkUtil; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -59,6 +60,7 @@ public class Table_YMark_p { count = 0; byte[] key; String name; + /* try { Iterator iter = sb.tables.keys(YMarkTables.TABLES.TAGS.tablename(bmk_user)); while(iter.hasNext()) { @@ -86,6 +88,7 @@ public class Table_YMark_p { } catch (RowSpaceExceededException e) { Log.logException(e); } + */ final String counts = post.get("count", null); int maxcount = (counts == null || counts.equals("all")) ? 
Integer.MAX_VALUE : post.getInt("count", 10); @@ -132,13 +135,16 @@ public class Table_YMark_p { Log.logException(e); } + // apply rebuildIndex request + /* if (!post.get("rebuildindex", "").isEmpty()) try { sb.tables.bookmarks.folders.rebuildIndex(bmk_user); sb.tables.bookmarks.tags.rebuildIndex(bmk_user); } catch (IOException e) { Log.logException(e); } + */ if (!post.get("deleterows", "").isEmpty()) { for (final Map.Entry entry: post.entrySet()) { @@ -234,9 +240,12 @@ public class Table_YMark_p { try { Iterator mapIterator; if (post.containsKey("folders") && !post.get("folders").isEmpty()) { - mapIterator = sb.tables.orderByPK(sb.tables.bookmarks.folders.getBookmarks(bmk_user, post.get("folders")), maxcount).iterator(); + // mapIterator = sb.tables.orderByPK(sb.tables.bookmarks.folders.getBookmarks(bmk_user, post.get("folders")), maxcount).iterator(); + mapIterator = sb.tables.bookmarks.getBookmarksByFolder(bmk_user, post.get("folders")); } else if(post.containsKey("tags") && !post.get("tags").isEmpty()) { - mapIterator = sb.tables.orderByPK(sb.tables.bookmarks.tags.getBookmarks(bmk_user, post.get("tags")), maxcount).iterator(); + // mapIterator = sb.tables.orderByPK(sb.tables.bookmarks.tags.getBookmarks(bmk_user, post.get("tags")), maxcount).iterator(); + final String[] tagArray = YMarkUtil.cleanTagsString(post.get(YMarkTables.BOOKMARK.TAGS.key())).split(YMarkUtil.TAGS_SEPARATOR); + mapIterator = sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagArray); } else { mapIterator = sb.tables.orderByPK(sb.tables.iterator(table, matcher), maxcount).iterator(); } @@ -261,9 +270,7 @@ public class Table_YMark_p { } } catch (IOException e) { Log.logException(e); - } catch (RowSpaceExceededException e) { - Log.logException(e); - } + } prop.put("showtable_list", count); prop.put("showtable_num", count); } diff --git a/htroot/api/ymarks/add_ymark.java b/htroot/api/ymarks/add_ymark.java index 49395c8cd..b194a63a4 100644 --- a/htroot/api/ymarks/add_ymark.java +++ 
b/htroot/api/ymarks/add_ymark.java @@ -4,8 +4,9 @@ import java.util.HashMap; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; -import de.anomic.data.YMarkTables; import de.anomic.data.UserDB; +import de.anomic.data.ymark.YMarkTables; +import de.anomic.data.ymark.YMarkUtil; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -39,8 +40,8 @@ public class add_ymark { data.put(YMarkTables.BOOKMARK.TITLE.key(), post.get(YMarkTables.BOOKMARK.TITLE.key(),YMarkTables.BOOKMARK.TITLE.deflt())); data.put(YMarkTables.BOOKMARK.DESC.key(), post.get(YMarkTables.BOOKMARK.DESC.key(),YMarkTables.BOOKMARK.DESC.deflt())); data.put(YMarkTables.BOOKMARK.PUBLIC.key(), post.get(YMarkTables.BOOKMARK.PUBLIC.key(),YMarkTables.BOOKMARK.PUBLIC.deflt())); - data.put(YMarkTables.BOOKMARK.TAGS.key(), YMarkTables.cleanTagsString(post.get(YMarkTables.BOOKMARK.TAGS.key(),YMarkTables.BOOKMARK.TAGS.deflt()))); - data.put(YMarkTables.BOOKMARK.FOLDERS.key(), YMarkTables.cleanFoldersString(post.get(YMarkTables.BOOKMARK.FOLDERS.key(),YMarkTables.FOLDERS_UNSORTED))); + data.put(YMarkTables.BOOKMARK.TAGS.key(), YMarkUtil.cleanTagsString(post.get(YMarkTables.BOOKMARK.TAGS.key(),YMarkTables.BOOKMARK.TAGS.deflt()))); + data.put(YMarkTables.BOOKMARK.FOLDERS.key(), YMarkUtil.cleanFoldersString(post.get(YMarkTables.BOOKMARK.FOLDERS.key(),YMarkTables.FOLDERS_UNSORTED))); try { sb.tables.bookmarks.addBookmark(bmk_user, data, false); diff --git a/htroot/api/ymarks/delete_ymark.java b/htroot/api/ymarks/delete_ymark.java index c8bd649c5..369a216cf 100644 --- a/htroot/api/ymarks/delete_ymark.java +++ b/htroot/api/ymarks/delete_ymark.java @@ -3,8 +3,9 @@ import java.io.IOException; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; -import de.anomic.data.YMarkTables; import 
de.anomic.data.UserDB; +import de.anomic.data.ymark.YMarkTables; +import de.anomic.data.ymark.YMarkUtil; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -29,7 +30,7 @@ public class delete_ymark { if(post.containsKey(YMarkTables.BOOKMARKS_ID)) { urlHash = post.get(YMarkTables.BOOKMARKS_ID).getBytes(); } else if(post.containsKey(YMarkTables.BOOKMARK.URL.key())) { - urlHash = YMarkTables.getBookmarkId(post.get(YMarkTables.BOOKMARK.URL.key())); + urlHash = YMarkUtil.getBookmarkId(post.get(YMarkTables.BOOKMARK.URL.key())); } else { prop.put("result", "0"); return prop; diff --git a/htroot/api/ymarks/get_treeview.java b/htroot/api/ymarks/get_treeview.java index 677041d85..a729d411e 100644 --- a/htroot/api/ymarks/get_treeview.java +++ b/htroot/api/ymarks/get_treeview.java @@ -9,19 +9,17 @@ import java.util.TreeMap; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.document.Document; import net.yacy.document.Parser.Failure; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.retrieval.Response; import de.anomic.data.UserDB; -import de.anomic.data.YMarkTables; -import de.anomic.data.YMarkTables.METADATA; -import de.anomic.search.Segments; +import de.anomic.data.ymark.YMarkCrawlStart; +import de.anomic.data.ymark.YMarkMetadata; +import de.anomic.data.ymark.YMarkTables; +import de.anomic.data.ymark.YMarkUtil; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -49,6 +47,8 @@ public class get_treeview { boolean isFolder = true; boolean isBookmark = false; boolean isMetadata = false; + boolean isURLdb = false; + boolean 
isCrawlStart = false; boolean isWordCount = false; if (post != null){ @@ -63,29 +63,37 @@ public class get_treeview { } else if (post.get(ROOT).startsWith("m:")) { isMetadata = true; isFolder = false; + } else if (post.get(ROOT).startsWith("u:")) { + isURLdb = true; + isFolder = false; } else if (post.get(ROOT).startsWith("w:")) { isWordCount = true; isFolder = false; + } else if (post.get(ROOT).startsWith("c:")) { + isCrawlStart = true; + isFolder = false; } } } Iterator it = null; + Iterator bit = null; Tables.Row bmk_row = null; int count = 0; if(isFolder) { // loop through folderList - try { - it = sb.tables.bookmarks.folders.getFolders(bmk_user, root); + try { + // it = sb.tables.bookmarks.folders.getFolders(bmk_user, root); + it = sb.tables.bookmarks.getFolders(bmk_user, root).iterator(); } catch (IOException e) { Log.logException(e); } - int n = root.split(YMarkTables.FOLDERS_SEPARATOR).length; + int n = root.split(YMarkUtil.FOLDERS_SEPARATOR).length; if (n == 0) n = 1; while (it.hasNext()) { String folder = it.next(); - foldername = folder.split(YMarkTables.FOLDERS_SEPARATOR); + foldername = folder.split(YMarkUtil.FOLDERS_SEPARATOR); if (foldername.length == n+1) { prop.put("folders_"+count+"_foldername", foldername[n]); prop.put("folders_"+count+"_expanded", "false"); @@ -99,42 +107,40 @@ public class get_treeview { } // loop through bookmarkList try { - it = sb.tables.bookmarks.folders.getBookmarkIds(bmk_user, root).iterator(); - while (it.hasNext()) { - final String urlHash = it.next(); - bmk_row = sb.tables.select(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user), urlHash.getBytes()); - if(bmk_row != null) { - final String url = UTF8.String(bmk_row.get(YMarkTables.BOOKMARK.URL.key())); - final String title = bmk_row.get(YMarkTables.BOOKMARK.TITLE.key(), YMarkTables.BOOKMARK.TITLE.deflt()); - - // TODO: get_treeview - get rid of bmtype - if (post.containsKey("bmtype")) { - if (post.get("bmtype").equals("title")) { - 
prop.put("folders_"+count+"_foldername", title); - } else if (post.get("bmtype").equals("href")) { - prop.put("folders_"+count+"_foldername", - ""+title+""); - } - } else { - prop.put("folders_"+count+"_foldername", url); - } - prop.put("folders_"+count+"_expanded", "false"); - prop.put("folders_"+count+"_url", url); - prop.put("folders_"+count+"_type", "file"); - prop.put("folders_"+count+"_hash", "b:"+urlHash); - prop.put("folders_"+count+"_hasChildren", "true"); - prop.put("folders_"+count+"_comma", ","); - count++; - } - } + if(!root.isEmpty()) { + bit = sb.tables.bookmarks.getBookmarksByFolder(bmk_user, root); + while (bit.hasNext()) { + bmk_row = bit.next(); + if(bmk_row != null) { + final String url = UTF8.String(bmk_row.get(YMarkTables.BOOKMARK.URL.key())); + final String title = bmk_row.get(YMarkTables.BOOKMARK.TITLE.key(), YMarkTables.BOOKMARK.TITLE.deflt()); + + // TODO: get_treeview - get rid of bmtype + if (post.containsKey("bmtype")) { + if (post.get("bmtype").equals("title")) { + prop.putJSON("folders_"+count+"_foldername", title); + } else if (post.get("bmtype").equals("href")) { + prop.putJSON("folders_"+count+"_foldername", ""+title+""); + } + } else { + prop.putJSON("folders_"+count+"_foldername", url); + } + prop.put("folders_"+count+"_expanded", "false"); + prop.put("folders_"+count+"_url", url); + prop.put("folders_"+count+"_type", "file"); + prop.put("folders_"+count+"_hash", "b:"+new String(bmk_row.getPK())); + prop.put("folders_"+count+"_hasChildren", "true"); + prop.put("folders_"+count+"_comma", ","); + count++; + } + } + } count--; prop.put("folders_"+count+"_comma", ""); count++; prop.put("folders", count); } catch (IOException e) { Log.logException(e); - } catch (RowSpaceExceededException e) { - Log.logException(e); } } else if(isBookmark) { try { @@ -170,6 +176,16 @@ public class get_treeview { prop.put("folders_"+count+"_hash", "m:"+url); prop.put("folders_"+count+"_hasChildren", "true"); count++; + 
prop.put("folders_"+count+"_foldername","URLdb"); + putProp(count, "meta"); + prop.put("folders_"+count+"_hash", "u:"+url); + prop.put("folders_"+count+"_hasChildren", "true"); + count++; + prop.put("folders_"+count+"_foldername","CrawlStart"); + putProp(count, "meta"); + prop.put("folders_"+count+"_hash", "c:"+url); + prop.put("folders_"+count+"_hasChildren", "true"); + count++; prop.put("folders_"+count+"_foldername","WordCounts"); putProp(count, "meta"); prop.put("folders_"+count+"_hash", "w:"+url); @@ -183,46 +199,42 @@ public class get_treeview { } catch (RowSpaceExceededException e) { Log.logException(e); } - } else if (isWordCount || isMetadata) { + } else if (isWordCount || isMetadata || isURLdb || isCrawlStart) { try { - final DigestURI u = new DigestURI(post.get(ROOT).substring(2)); - Response response = null; - response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true); - final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); - if(document != null) { - if(isWordCount) { - final TreeMap words = YMarkTables.getWordCounts(document); - final ArrayList topwords = new ArrayList(words.descendingKeySet()); - for(int i = 0; i < 20 && i < topwords.size(); i++) { - String word = topwords.get(i); - int occur = words.get(word).occurrences(); - prop.put("folders_"+count+"_foldername",""+word+": [" + occur + "]"); - putProp(count, "meta"); - count++; - } - count--; - prop.put("folders_"+count+"_comma", ""); + final YMarkMetadata meta = new YMarkMetadata(new DigestURI(post.get(ROOT).substring(2)), sb.indexSegments); + meta.loadDocument(sb.loader); + if(isWordCount) { + final TreeMap words = meta.getWordCounts(); + final ArrayList topwords = new ArrayList(words.descendingKeySet()); + for(int i = 0; i < 20 && i < topwords.size(); i++) { + String word = topwords.get(i); + int occur = words.get(word).occurrences(); + 
prop.put("folders_"+count+"_foldername",""+word+": [" + occur + "]"); + putProp(count, "meta"); count++; - prop.put("folders", count); - } else if(isMetadata) { - EnumMap metadata; - metadata = YMarkTables.getMetadata(YMarkTables.getBookmarkId(post.get(ROOT).substring(2)), sb.indexSegments.segment(Segments.Process.PUBLIC)); - if (metadata.isEmpty()) - metadata = YMarkTables.getMetadata(document); - final Iterator iter = metadata.keySet().iterator(); - while (iter.hasNext()) { - final METADATA key = iter.next(); - final String value = metadata.get(key); - prop.put("folders_"+count+"_foldername",""+key.toString().toLowerCase()+": " + value + ""); - putProp(count, "meta"); - count++; - } - prop.put("folders_"+count+"_foldername","autotag: " + sb.tables.bookmarks.autoTag(document, bmk_user, 5) + ""); - putProp(count, "meta"); - count++; - prop.put("folders", count); - } - } + } + count--; + prop.put("folders_"+count+"_comma", ""); + count++; + prop.put("folders", count); + } else if(isMetadata) { + count = putMeta(count, meta.loadMetadata()); + } else if(isURLdb) { + count = putMeta(count, meta.getMetadata()); + } else if(isCrawlStart) { + Log.logInfo("YMark", "I am looking for CrawlStart: "+post.get(ROOT).substring(2)); + final YMarkCrawlStart crawlStart = new YMarkCrawlStart(sb.tables, post.get(ROOT).substring(2)); + final Iterator iter = crawlStart.keySet().iterator(); + String key; + while(iter.hasNext()) { + key = iter.next(); + prop.put("folders_"+count+"_foldername",""+key.toLowerCase()+": " + crawlStart.get(key) + ""); + putProp(count, "meta"); + count++; + } + prop.put("folders", count); + } + } catch (MalformedURLException e) { Log.logException(e); } catch (IOException e) { @@ -245,4 +257,16 @@ public class get_treeview { prop.put("folders_"+count+"_hasChildren", "false"); prop.put("folders_"+count+"_comma", ","); } + public static int putMeta(int count, final EnumMap metadata) { + final Iterator iter = metadata.keySet().iterator(); + while (iter.hasNext()) { 
+ final YMarkMetadata.METADATA key = iter.next(); + final String value = metadata.get(key); + prop.put("folders_"+count+"_foldername",""+key.toString().toLowerCase()+": " + value + ""); + putProp(count, "meta"); + count++; + } + prop.put("folders", count); + return count; + } } diff --git a/htroot/api/ymarks/get_xbel.java b/htroot/api/ymarks/get_xbel.java index 6bed2c28c..b7bc24814 100644 --- a/htroot/api/ymarks/get_xbel.java +++ b/htroot/api/ymarks/get_xbel.java @@ -6,11 +6,12 @@ import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; -import de.anomic.data.YMarkTables; -import de.anomic.data.YMarksXBELImporter; import de.anomic.data.UserDB; +import de.anomic.data.ymark.YMarkDate; +import de.anomic.data.ymark.YMarkTables; +import de.anomic.data.ymark.YMarkUtil; +import de.anomic.data.ymark.YMarkXBELImporter; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -50,14 +51,15 @@ public class get_xbel { root = ""; } - final int root_depth = root.split(YMarkTables.FOLDERS_SEPARATOR).length; + final int root_depth = root.split(YMarkUtil.FOLDERS_SEPARATOR).length; Iterator fit = null; - Iterator bit = null; + Iterator bit = null; int count = 0; int n = root_depth; try { - fit = sb.tables.bookmarks.folders.getFolders(bmk_user, root); + // fit = sb.tables.bookmarks.folders.getFolders(bmk_user, root); + fit = sb.tables.bookmarks.getFolders(bmk_user, root).iterator(); } catch (IOException e) { Log.logException(e); } @@ -66,7 +68,7 @@ public class get_xbel { while (fit.hasNext()) { String folder = fit.next(); - foldername = folder.split(YMarkTables.FOLDERS_SEPARATOR); + foldername = folder.split(YMarkUtil.FOLDERS_SEPARATOR); if (n != root_depth && foldername.length <= n) { 
prop.put("xbel_"+count+"_elements", ""); count++; @@ -74,90 +76,95 @@ public class get_xbel { if (foldername.length >= n) { n = foldername.length; if(n != root_depth) { - prop.put("xbel_"+count+"_elements", ""); + prop.put("xbel_"+count+"_elements", ""); count++; prop.put("xbel_"+count+"_elements", "" + CharacterCoding.unicode2xml(foldername[n-1], true) + ""); count++; } - try { - bit = sb.tables.bookmarks.folders.getBookmarkIds(bmk_user, folder).iterator(); - Tables.Row bmk_row = null; - String urlHash; - while(bit.hasNext()){ - urlHash = bit.next(); - if(alias.contains(urlHash)) { - buffer.setLength(0); - buffer.append(YMarksXBELImporter.XBEL.ALIAS.startTag(true)); - buffer.append(" ref=\"b:"); - buffer.append(urlHash); - buffer.append("\"/>"); - prop.put("xbel_"+count+"_elements", buffer.toString()); - count++; - } else { - alias.add(urlHash); - bmk_row = sb.tables.select(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user), urlHash.getBytes()); - if(bmk_row != null) { - buffer.setLength(0); - - buffer.append(YMarksXBELImporter.XBEL.BOOKMARK.startTag(true)); - buffer.append(" id=\"b:"); - buffer.append(urlHash); - - buffer.append(YMarkTables.BOOKMARK.URL.xbel()); - buffer.append(CharacterCoding.unicode2xml(bmk_row.get(YMarkTables.BOOKMARK.URL.key(), YMarkTables.BOOKMARK.URL.deflt()), true)); - - buffer.append(YMarkTables.BOOKMARK.DATE_ADDED.xbel()); - buffer.append(CharacterCoding.unicode2xml(YMarkTables.getISO8601(bmk_row.get(YMarkTables.BOOKMARK.DATE_ADDED.key())), true)); - - buffer.append(YMarkTables.BOOKMARK.DATE_MODIFIED.xbel()); - buffer.append(CharacterCoding.unicode2xml(YMarkTables.getISO8601(bmk_row.get(YMarkTables.BOOKMARK.DATE_MODIFIED.key())), true)); - - buffer.append(YMarkTables.BOOKMARK.DATE_VISITED.xbel()); - buffer.append(CharacterCoding.unicode2xml(YMarkTables.getISO8601(bmk_row.get(YMarkTables.BOOKMARK.DATE_VISITED.key())), true)); - - buffer.append(YMarkTables.BOOKMARK.TAGS.xbel()); - buffer.append(bmk_row.get(YMarkTables.BOOKMARK.TAGS.key(), 
YMarkTables.BOOKMARK.TAGS.deflt())); - - buffer.append(YMarkTables.BOOKMARK.PUBLIC.xbel()); - buffer.append(bmk_row.get(YMarkTables.BOOKMARK.PUBLIC.key(), YMarkTables.BOOKMARK.PUBLIC.deflt())); - - buffer.append(YMarkTables.BOOKMARK.VISITS.xbel()); - buffer.append(bmk_row.get(YMarkTables.BOOKMARK.VISITS.key(), YMarkTables.BOOKMARK.VISITS.deflt())); - - buffer.append("\"\n>"); - prop.put("xbel_"+count+"_elements", buffer.toString()); - count++; - - buffer.setLength(0); - buffer.append(YMarksXBELImporter.XBEL.TITLE.startTag(false)); - buffer.append(CharacterCoding.unicode2xml(bmk_row.get(YMarkTables.BOOKMARK.TITLE.key(), YMarkTables.BOOKMARK.TITLE.deflt()), true)); - buffer.append(YMarksXBELImporter.XBEL.TITLE.endTag(false)); - prop.put("xbel_"+count+"_elements", buffer.toString()); - count++; - - buffer.setLength(0); - buffer.append(YMarksXBELImporter.XBEL.DESC.startTag(false)); - buffer.append(CharacterCoding.unicode2xml(bmk_row.get(YMarkTables.BOOKMARK.DESC.key(), YMarkTables.BOOKMARK.DESC.deflt()), true)); - buffer.append(YMarksXBELImporter.XBEL.DESC.endTag(false)); - prop.put("xbel_"+count+"_elements", buffer.toString()); - count++; - - prop.put("xbel_"+count+"_elements", YMarksXBELImporter.XBEL.BOOKMARK.endTag(false)); - count++; - } - } - } + // bit = sb.tables.bookmarks.folders.getBookmarkIds(bmk_user, folder).iterator(); + try { + bit = sb.tables.bookmarks.getBookmarksByFolder(bmk_user, folder); } catch (IOException e) { - Log.logException(e); - continue; - } catch (RowSpaceExceededException e) { - Log.logException(e); - continue; + // TODO: better error handling (avoid NPE) + bit = null; + } + Tables.Row bmk_row = null; + String urlHash; + final YMarkDate date = new YMarkDate(); + while(bit.hasNext()){ + // urlHash = bit.next(); + bmk_row = bit.next(); + urlHash = new String(bmk_row.getPK()); + + if(alias.contains(urlHash)) { + buffer.setLength(0); + buffer.append(YMarkXBELImporter.XBEL.ALIAS.startTag(true)); + buffer.append(" ref=\"b:"); + 
buffer.append(urlHash); + buffer.append("\"/>"); + prop.put("xbel_"+count+"_elements", buffer.toString()); + count++; + } else { + alias.add(urlHash); + // bmk_row = sb.tables.select(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user), urlHash.getBytes()); + if(bmk_row != null) { + buffer.setLength(0); + + buffer.append(YMarkXBELImporter.XBEL.BOOKMARK.startTag(true)); + buffer.append(" id=\"b:"); + buffer.append(urlHash); + + buffer.append(YMarkTables.BOOKMARK.URL.xbel()); + buffer.append(CharacterCoding.unicode2xml(bmk_row.get(YMarkTables.BOOKMARK.URL.key(), YMarkTables.BOOKMARK.URL.deflt()), true)); + + buffer.append(YMarkTables.BOOKMARK.DATE_ADDED.xbel()); + date.set(bmk_row.get(YMarkTables.BOOKMARK.DATE_ADDED.key())); + buffer.append(CharacterCoding.unicode2xml(date.toISO8601(), true)); + + buffer.append(YMarkTables.BOOKMARK.DATE_MODIFIED.xbel()); + date.set(bmk_row.get(YMarkTables.BOOKMARK.DATE_MODIFIED.key())); + buffer.append(CharacterCoding.unicode2xml(date.toISO8601(), true)); + + buffer.append(YMarkTables.BOOKMARK.DATE_VISITED.xbel()); + date.set(bmk_row.get(YMarkTables.BOOKMARK.DATE_VISITED.key())); + buffer.append(CharacterCoding.unicode2xml(date.toISO8601(), true)); + + buffer.append(YMarkTables.BOOKMARK.TAGS.xbel()); + buffer.append(bmk_row.get(YMarkTables.BOOKMARK.TAGS.key(), YMarkTables.BOOKMARK.TAGS.deflt())); + + buffer.append(YMarkTables.BOOKMARK.PUBLIC.xbel()); + buffer.append(bmk_row.get(YMarkTables.BOOKMARK.PUBLIC.key(), YMarkTables.BOOKMARK.PUBLIC.deflt())); + + buffer.append(YMarkTables.BOOKMARK.VISITS.xbel()); + buffer.append(bmk_row.get(YMarkTables.BOOKMARK.VISITS.key(), YMarkTables.BOOKMARK.VISITS.deflt())); + + buffer.append("\"\n>"); + prop.put("xbel_"+count+"_elements", buffer.toString()); + count++; + + buffer.setLength(0); + buffer.append(YMarkXBELImporter.XBEL.TITLE.startTag(false)); + buffer.append(CharacterCoding.unicode2xml(bmk_row.get(YMarkTables.BOOKMARK.TITLE.key(), YMarkTables.BOOKMARK.TITLE.deflt()), true)); + 
buffer.append(YMarkXBELImporter.XBEL.TITLE.endTag(false)); + prop.put("xbel_"+count+"_elements", buffer.toString()); + count++; + + buffer.setLength(0); + buffer.append(YMarkXBELImporter.XBEL.DESC.startTag(false)); + buffer.append(CharacterCoding.unicode2xml(bmk_row.get(YMarkTables.BOOKMARK.DESC.key(), YMarkTables.BOOKMARK.DESC.deflt()), true)); + buffer.append(YMarkXBELImporter.XBEL.DESC.endTag(false)); + prop.put("xbel_"+count+"_elements", buffer.toString()); + count++; + + prop.put("xbel_"+count+"_elements", YMarkXBELImporter.XBEL.BOOKMARK.endTag(false)); + count++; + } + } } } } while(n > root_depth) { - prop.put("xbel_"+count+"_elements", YMarksXBELImporter.XBEL.FOLDER.endTag(false)); + prop.put("xbel_"+count+"_elements", YMarkXBELImporter.XBEL.FOLDER.endTag(false)); count++; n--; } diff --git a/htroot/api/ymarks/get_ymark.java b/htroot/api/ymarks/get_ymark.java index ba5ca130c..9fe7d28b2 100644 --- a/htroot/api/ymarks/get_ymark.java +++ b/htroot/api/ymarks/get_ymark.java @@ -1,14 +1,13 @@ import java.io.IOException; import java.util.Iterator; -import java.util.TreeSet; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; -import de.anomic.data.YMarkTables; import de.anomic.data.UserDB; +import de.anomic.data.ymark.YMarkTables; +import de.anomic.data.ymark.YMarkUtil; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -28,22 +27,21 @@ public class get_ymark { final UserDB.Entry user = sb.userDB.getUser(header); final boolean isAdmin = (sb.verifyAuthentication(header, true)); final boolean isAuthUser = user!= null && user.hasRight(UserDB.AccessRight.BOOKMARK_RIGHT); - final TreeSet bookmarks = new TreeSet(); + Iterator bookmarks = null; if(isAdmin || isAuthUser) { final String bmk_user = (isAuthUser ? 
user.getUserName() : YMarkTables.USER_ADMIN); if(post.containsKey(YMarkTables.BOOKMARK.TAGS.key())) { tags = true; - final String[] tagArray = YMarkTables.cleanTagsString(post.get(YMarkTables.BOOKMARK.TAGS.key())).split(YMarkTables.TAGS_SEPARATOR); + final String[] tagArray = YMarkUtil.cleanTagsString(post.get(YMarkTables.BOOKMARK.TAGS.key())).split(YMarkUtil.TAGS_SEPARATOR); try { - bookmarks.addAll(sb.tables.bookmarks.tags.getBookmarkIds(bmk_user, tagArray)); + bookmarks = sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagArray); } catch (IOException e) { Log.logException(e); - } catch (RowSpaceExceededException e) { - Log.logException(e); } } + /* if(post.containsKey(YMarkTables.BOOKMARK.FOLDERS.key())) { final String[] folderArray = YMarkTables.cleanFoldersString(post.get(YMarkTables.BOOKMARK.FOLDERS.key())).split(YMarkTables.TAGS_SEPARATOR); try { @@ -57,7 +55,8 @@ public class get_ymark { Log.logException(e); } } - putBookmarks(bookmarks, YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user)); + */ + putBookmarks(bookmarks); } else { prop.put(YMarkTables.USER_AUTHENTICATE,YMarkTables.USER_AUTHENTICATE_MSG); @@ -66,25 +65,16 @@ public class get_ymark { return prop; } - private static void putBookmarks(final TreeSet urlSet, final String bmk_table) { - final IteratorurlIter = urlSet.iterator(); + private static void putBookmarks(final Iterator bit) { int count = 0; - while(urlIter.hasNext()) { - final byte[] urlHash = urlIter.next().getBytes(); - Tables.Row bmk_row = null; - try { - bmk_row = sb.tables.select(bmk_table, urlHash); - if (bmk_row != null) { - prop.putXML("bookmarks_"+count+"_id", UTF8.String(urlHash)); - for (YMarkTables.BOOKMARK bmk : YMarkTables.BOOKMARK.values()) { - prop.putXML("bookmarks_"+count+"_"+bmk.key(), bmk_row.get(bmk.key(),bmk.deflt())); - } - count++; - } - } catch (IOException e) { - Log.logException(e); - } catch (RowSpaceExceededException e) { - Log.logException(e); + while(bit.hasNext()) { + Tables.Row bmk_row = bit.next(); + if 
(bmk_row != null) { + prop.putXML("bookmarks_"+count+"_id", UTF8.String(bmk_row.getPK())); + for (YMarkTables.BOOKMARK bmk : YMarkTables.BOOKMARK.values()) { + prop.putXML("bookmarks_"+count+"_"+bmk.key(), bmk_row.get(bmk.key(),bmk.deflt())); + } + count++; } } prop.put("bookmarks", count); diff --git a/htroot/api/ymarks/import_ymark.java b/htroot/api/ymarks/import_ymark.java index c8b376f63..c06ab59bb 100644 --- a/htroot/api/ymarks/import_ymark.java +++ b/htroot/api/ymarks/import_ymark.java @@ -4,7 +4,6 @@ import java.util.HashMap; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.document.Document; import net.yacy.document.Parser.Failure; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.RowSpaceExceededException; @@ -12,12 +11,11 @@ import net.yacy.kelondro.logging.Log; import org.xml.sax.SAXException; -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.retrieval.Response; import de.anomic.data.UserDB; -import de.anomic.data.YMarkTables; -import de.anomic.data.YMarksHTMLImporter; -import de.anomic.data.YMarksXBELImporter; +import de.anomic.data.ymark.YMarkHTMLImporter; +import de.anomic.data.ymark.YMarkMetadata; +import de.anomic.data.ymark.YMarkTables; +import de.anomic.data.ymark.YMarkXBELImporter; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -44,7 +42,7 @@ public class import_ymark { if(post.containsKey("bmkfile") && post.containsKey("importer")){ byteIn = new ByteArrayInputStream(UTF8.getBytes(post.get("bmkfile$file"))); if(post.get("importer").equals("html") && byteIn != null) { - final YMarksHTMLImporter htmlImporter = new YMarksHTMLImporter(byteIn, 100); + final YMarkHTMLImporter htmlImporter = new YMarkHTMLImporter(byteIn, 10); t = new Thread(htmlImporter, "YMarks - HTML Importer"); t.start(); while ((bmk = htmlImporter.take()) != YMarkTables.POISON) { @@ -52,10 +50,10 @@ public class 
import_ymark { } prop.put("result", "1"); } else if(post.get("importer").equals("xbel") && byteIn != null) { - final YMarksXBELImporter xbelImporter; + final YMarkXBELImporter xbelImporter; try { //TODO: make RootFold - xbelImporter = new YMarksXBELImporter(byteIn, 100, YMarkTables.FOLDERS_IMPORTED); + xbelImporter = new YMarkXBELImporter(byteIn, 100, YMarkTables.FOLDERS_IMPORTED); } catch (SAXException e) { //TODO: display an error message Log.logException(e); @@ -84,12 +82,9 @@ public class import_ymark { public static void putBookmark(final Switchboard sb, final String bmk_user, final HashMap bmk) { try { if(!bmk.containsKey(YMarkTables.BOOKMARK.TAGS.key()) || bmk.get(YMarkTables.BOOKMARK.TAGS.key()).equals(YMarkTables.BOOKMARK.TAGS.deflt())) { - final DigestURI u = new DigestURI(bmk.get(YMarkTables.BOOKMARK.URL.key())); - Response response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true); - final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); - if(document != null) { - bmk.put(YMarkTables.BOOKMARK.TAGS.key(), sb.tables.bookmarks.autoTag(document, bmk_user, 3)); - } + final YMarkMetadata meta = new YMarkMetadata(new DigestURI(bmk.get(YMarkTables.BOOKMARK.URL.key()))); + meta.loadDocument(sb.loader); + bmk.put(YMarkTables.BOOKMARK.TAGS.key(), meta.autoTag(3)); } sb.tables.bookmarks.addBookmark(bmk_user, bmk, true); } catch (IOException e) { diff --git a/source/de/anomic/data/WorkTables.java b/source/de/anomic/data/WorkTables.java index f06ccb122..af9542d18 100644 --- a/source/de/anomic/data/WorkTables.java +++ b/source/de/anomic/data/WorkTables.java @@ -1,4 +1,4 @@ -// Work.java +// WorkTables.java // (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany // first published 04.02.2010 on http://yacy.net // @@ -47,6 +47,7 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.rwi.IndexCell; +import de.anomic.data.ymark.YMarkTables; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; @@ -87,12 +88,6 @@ public class WorkTables extends Tables { this.bookmarks = new YMarkTables(this); } - @Override - public void clear(final String tablename) throws IOException { - super.clear(tablename); - this.bookmarks.clearIndex(tablename); - } - /** * recording of a api call. stores the call parameters into the API database table * @param post the post arguments of the api call diff --git a/source/de/anomic/data/YMarkIndex.java b/source/de/anomic/data/YMarkIndex.java deleted file mode 100644 index d8d581908..000000000 --- a/source/de/anomic/data/YMarkIndex.java +++ /dev/null @@ -1,259 +0,0 @@ -package de.anomic.data; - -import java.io.IOException; -import java.util.HashSet; -import java.util.Iterator; -import java.util.TreeSet; -import java.util.regex.Pattern; - -import net.yacy.cora.document.UTF8; -import net.yacy.cora.storage.ConcurrentARC; -import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.blob.Tables.Data; -import net.yacy.kelondro.blob.Tables.Row; -import net.yacy.kelondro.index.RowSpaceExceededException; -import net.yacy.kelondro.logging.Log; - -public class YMarkIndex { - - public static enum INDEX { - ID ("id", ""), - NAME ("name", ""), - DESC ("desc", ""), - URLS ("urls", ""); - - private String key; - private String dflt; - - private INDEX(String k, String s) { - this.key = k; - this.dflt = s; - } - public String key() { - return this.key; - } - public String deflt() { - return this.dflt; - } - } - - public static enum INDEX_ACTION { - ADD, - REMOVE - } - - public final static String PATTERN_PREFIX = "^\\Q"; - public final static String PATTERN_POSTFIX = 
YMarkTables.FOLDERS_SEPARATOR+"\\E.*$"; - - private final WorkTables worktables; - private final String table_basename; - private final ConcurrentARC cache; - - public YMarkIndex(final Tables wt, final String tb) { - this.worktables = (WorkTables)wt; - this.table_basename = tb; - this.cache = new ConcurrentARC(50,1); - } - - public String getKeyname(final String user, final byte[] key) throws IOException, RowSpaceExceededException { - final String index_table = user + this.table_basename; - Tables.Row row = this.worktables.select(index_table, key); - return row.get(INDEX.NAME.key(), INDEX.NAME.deflt()); - } - - public Iterator getFolders(final String user, final String root) throws IOException { - final String index_table = user + this.table_basename; - final TreeSet folders = new TreeSet(); - final Pattern r = Pattern.compile(PATTERN_PREFIX + root + PATTERN_POSTFIX); - final Iterator it = this.worktables.iterator(index_table, INDEX.NAME.key(), r); - final StringBuilder path = new StringBuilder(100); - Row folder; - - while (it.hasNext()) { - folder = it.next(); - path.setLength(0); - path.append(folder.get(INDEX.NAME.key(), INDEX.NAME.deflt())); - //TODO: get rid of .toString.equals() - while(path.length() > 0 && !path.toString().equals(root)){ - folders.add(path.toString()); - path.setLength(path.lastIndexOf(YMarkTables.FOLDERS_SEPARATOR)); - } - } - if (!root.equals(YMarkTables.FOLDERS_ROOT)) { folders.add(root); } - return folders.iterator(); - } - - protected void clearCache() { - this.cache.clear(); - } - - protected void createIndexEntry(final String user, final String keyname, final HashSet urlSet) throws IOException { - final byte[] key = YMarkTables.getKeyId(keyname); - final String index_table = user + this.table_basename; - final String cacheKey = index_table+":"+keyname; - final byte[] BurlSet = YMarkTables.keySetToBytes(urlSet); - Data tagEntry = new Data(); - this.cache.insert(cacheKey, BurlSet); - tagEntry.put(INDEX.NAME.key, keyname); - 
tagEntry.put(INDEX.URLS.key, BurlSet); - this.worktables.insert(index_table, key, tagEntry); - } - - protected void removeIndexEntry(final String user, String keysString, final byte[] urlHash) { - final String[] keyArray = keysString.split(YMarkTables.TAGS_SEPARATOR); - for (final String key : keyArray) { - this.updateIndexTable(user, key, urlHash, INDEX_ACTION.REMOVE); - } - } - - protected void insertIndexEntry(final String user, String keysString, final byte[] urlHash) { - final String[] keyArray = keysString.split(YMarkTables.TAGS_SEPARATOR); - for (final String key : keyArray) { - this.updateIndexTable(user, key, urlHash, INDEX_ACTION.ADD); - } - } - - protected void updateIndexEntry(final String user, final byte[] urlHash, final HashSet oldSet, final HashSet newSet) { - Iterator tagIter; - HashSet urlSet = new HashSet(newSet); - newSet.removeAll(oldSet); - tagIter = newSet.iterator(); - while(tagIter.hasNext()) { - this.updateIndexTable(user, tagIter.next(), urlHash, INDEX_ACTION.ADD); - } - oldSet.removeAll(urlSet); - tagIter=oldSet.iterator(); - while(tagIter.hasNext()) { - this.updateIndexTable(user, tagIter.next(), urlHash, INDEX_ACTION.REMOVE); - } - } - - public HashSet getBookmarkIds(final String user, final String keyname) throws IOException, RowSpaceExceededException { - final String index_table = user + this.table_basename; - final String cacheKey = index_table+":"+keyname; - if (this.cache.containsKey(cacheKey)) { - return YMarkTables.keysStringToSet(UTF8.String(this.cache.get(cacheKey))); - } else { - final Tables.Row idx_row = this.worktables.select(index_table, YMarkTables.getKeyId(keyname)); - if (idx_row != null) { - final byte[] keys = idx_row.get(INDEX.URLS.key); - this.cache.put(cacheKey, keys); - return YMarkTables.keysStringToSet(UTF8.String(keys)); - } - } - return new HashSet(); - } - - public Iterator getBookmarks(final String user, final String keyname) throws IOException, RowSpaceExceededException { - final Iterator bit = 
getBookmarkIds(user, keyname).iterator(); - final HashSet bookmarks = new HashSet(); - while(bit.hasNext()) { - bookmarks.add(this.worktables.select(YMarkTables.TABLES.BOOKMARKS.tablename(user), bit.next().getBytes())); - } - return bookmarks.iterator(); - } - - public HashSet getBookmarkIds(final String user, final String[] keyArray) throws IOException, RowSpaceExceededException { - final HashSet urlSet = new HashSet(); - urlSet.addAll(getBookmarkIds(user, keyArray[0])); - if (urlSet.isEmpty()) - return urlSet; - if (keyArray.length > 1) { - for (final String keyname : keyArray) { - urlSet.retainAll(getBookmarkIds(user, keyname)); - if (urlSet.isEmpty()) - return urlSet; - } - } - return urlSet; - } - - public void rebuildIndex(final String bmk_user) throws IOException { - final Iterator plainIterator = this.worktables.iterator(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user)); - this.clearCache(); - this.worktables.clear(bmk_user + this.table_basename); - while (plainIterator.hasNext()) { - Tables.Row row = plainIterator.next(); - if (row != null && row.containsKey(this.table_basename.substring(1))) { - final String url = UTF8.String(row.get(YMarkTables.BOOKMARK.URL.key())); - final String key = this.table_basename.substring(1); - final String keysString = row.get(key, YMarkTables.BOOKMARK.get(key).deflt()); - this.insertIndexEntry(bmk_user, keysString, YMarkTables.getBookmarkId(url)); - } - } - } - - /** - * YMark function that updates the tag/folder index - * @param user - * @param keyname - * @param url is the url has as returned by DigestURI.hash() - * @param action is either add (1) or remove (2) - */ - protected void updateIndexTable(final String user, final String keyname, final byte[] url, final INDEX_ACTION action) { - final String index_table = user + this.table_basename; - final String cacheKey = index_table+":"+keyname; - final byte[] key = YMarkTables.getKeyId(keyname); - final String urlHash = UTF8.String(url); - Tables.Row row = null; - - // try to 
load urlSet from cache - HashSeturlSet = this.cache.containsKey(cacheKey) ? YMarkTables.keysStringToSet(UTF8.String(this.cache.get(cacheKey))) : new HashSet(); - - try { - row = this.worktables.select(index_table, key); - - // key has no index_table entry - if(row == null) { - switch (action) { - case ADD: - urlSet.add(urlHash); - createIndexEntry(user, keyname, urlSet); - break; - case REMOVE: - // key has no index_table entry but a cache entry - // TODO: this shouldn't happen - if(!urlSet.isEmpty()) { - urlSet.remove(urlHash); - createIndexEntry(user, keyname, urlSet); - } - break; - default: - break; - } - } - // key has an existing index_table entry - else { - byte[] BurlSet = null; - // key has no cache entry - if (urlSet.isEmpty()) { - // load urlSet from index_table - urlSet = YMarkTables.keysStringToSet(UTF8.String(row.get(INDEX.URLS.key))); - } - switch (action) { - case ADD: - urlSet.add(urlHash); - break; - case REMOVE: - urlSet.remove(urlHash); - break; - default: - break; - } - if (urlSet.isEmpty()) { - this.cache.remove(cacheKey); - this.worktables.delete(index_table, key); - } else { - BurlSet = YMarkTables.keySetToBytes(urlSet); - this.cache.insert(cacheKey, BurlSet); - row.put(INDEX.URLS.key, BurlSet); - this.worktables.update(index_table, row); - } - } - } catch (IOException e) { - Log.logException(e); - } catch (RowSpaceExceededException e) { - Log.logException(e); - } - } -} diff --git a/source/de/anomic/data/YMarkTables.java b/source/de/anomic/data/YMarkTables.java deleted file mode 100644 index bf72d290f..000000000 --- a/source/de/anomic/data/YMarkTables.java +++ /dev/null @@ -1,517 +0,0 @@ -package de.anomic.data; - -import java.io.IOException; -import java.io.ByteArrayInputStream; -import java.io.UnsupportedEncodingException; -import java.net.MalformedURLException; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.EnumMap; -import java.util.EnumSet; 
-import java.util.Enumeration; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.TreeMap; - -import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.UTF8; -import net.yacy.document.Condenser; -import net.yacy.document.Document; -import net.yacy.document.LibraryProvider; -import net.yacy.document.WordTokenizer; -import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.blob.Tables.Data; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.data.word.Word; -import net.yacy.kelondro.index.RowSpaceExceededException; -import net.yacy.kelondro.logging.Log; -import de.anomic.search.Segment; - -public class YMarkTables { - - public static enum TABLES { - BOOKMARKS ("_bookmarks"), - TAGS ("_tags"), - FOLDERS ("_folders"); - - private String basename; - - private TABLES(String b) { - this.basename = b; - } - public String basename() { - return this.basename; - } - public String tablename(String bmk_user) { - return bmk_user+this.basename; - } - } - - public static enum PROTOCOLS { - HTTP ("http://"), - HTTPS ("https://"); - - private String protocol; - - private PROTOCOLS(String s) { - this.protocol = s; - } - public String protocol() { - return this.protocol; - } - public String protocol(String s) { - return this.protocol+s; - } - } - - public static enum BOOKMARK { - // key dflt html_attrb xbel_attrb type - URL ("url", "", "href", "href", "link"), - TITLE ("title", "", "", "", "meta"), - DESC ("desc", "", "", "", "comment"), - DATE_ADDED ("date_added", "", "add_date", "added", "date"), - DATE_MODIFIED ("date_modified", "", "last_modified", "modified", "date"), - DATE_VISITED ("date_visited", "", "last_visited", "visited", "date"), - PUBLIC ("public", "flase", "", "yacy:public", "lock"), - TAGS ("tags", "unsorted", "shortcuturl", "yacy:tags", "tag"), - VISITS ("visits", "0", "", "yacy:visits", "stat"), - 
FOLDERS ("folders", "/unsorted", "", "", "folder"); - - private String key; - private String dflt; - private String html_attrb; - private String xbel_attrb; - private String type; - - private static final Map lookup = new HashMap(); - static { - for(BOOKMARK b : EnumSet.allOf(BOOKMARK.class)) - lookup.put(b.key(), b); - } - - private static StringBuilder buffer = new StringBuilder(25);; - - private BOOKMARK(String k, String s, String a, String x, String t) { - this.key = k; - this.dflt = s; - this.html_attrb = a; - this.xbel_attrb = x; - this.type = t; - } - public static BOOKMARK get(String key) { - return lookup.get(key); - } - public static boolean contains(String key) { - return lookup.containsKey(key); - } - public String key() { - return this.key; - } - public String deflt() { - return this.dflt; - } - public String html_attrb() { - return this.html_attrb; - } - public String xbel_attrb() { - return this.xbel_attrb; - } - public String xbel() { - buffer.setLength(0); - buffer.append('"'); - buffer.append('\n'); - buffer.append(' '); - buffer.append(this.xbel_attrb); - buffer.append('='); - buffer.append('"'); - return buffer.toString(); - } - public String type() { - return this.type; - } - } - - public enum METADATA { - TITLE, - DESCRIPTION, - FAVICON, - KEYWORDS, - LANGUAGE, - CREATOR, - PUBLISHER, - CHARSET, - MIMETYPE, - SIZE, - WORDCOUNT, - IN_URLDB, - FRESHDATE, - LOADDATE, - MODDATE, - SNIPPET - } - - public final static HashMap POISON = new HashMap(); - - public final static String TAGS_SEPARATOR = ","; - - public final static String FOLDERS_SEPARATOR = "/"; - public final static String FOLDERS_ROOT = "/"; - public final static String FOLDERS_UNSORTED = "/unsorted"; - public final static String FOLDERS_IMPORTED = "/imported"; - public static final int FOLDER_BUFFER_SIZE = 100; - - public final static String BOOKMARKS_LOG = "BOOKMARKS"; - public final static String BOOKMARKS_ID = "id"; - - public final static String USER_ADMIN = "admin"; - public final 
static String USER_AUTHENTICATE = "AUTHENTICATE"; - public final static String USER_AUTHENTICATE_MSG = "Authentication required!"; - - private WorkTables worktables; - public YMarkIndex tags; - public YMarkIndex folders; - - public YMarkTables(final Tables wt) { - this.worktables = (WorkTables)wt; - this.folders = new YMarkIndex(this.worktables, TABLES.FOLDERS.basename()); - this.tags = new YMarkIndex(this.worktables, TABLES.TAGS.basename()); - } - - public static Date parseISO8601(final String s) throws ParseException { - if(s == null || s.length() < 1) { - throw new ParseException("parseISO8601 - empty string, nothing to parse", 0); - } - SimpleDateFormat dateformat; - StringBuilder date = new StringBuilder(s); - if(s.length()==10) - dateformat = new SimpleDateFormat("yyyy-MM-dd"); - else { - dateformat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssz"); - if(date.charAt(date.length()-1) == 'Z') { - date.deleteCharAt(date.length()-1); - date.append("GMT-00:00"); - } else { - date.insert(date.length()-6, "GMT"); - } - } - return dateformat.parse(date.toString()); - } - - public static String getISO8601(final byte[] date) { - if(date != null) { - final String s = UTF8.String(date); - if(s != null && s.length() > 0) - return ISO8601Formatter.FORMATTER.format(new Date(Long.parseLong(s))); - } - return ""; - } - - public final static byte[] getBookmarkId(String url) throws MalformedURLException { - return (new DigestURI(url, null)).hash(); - } - - public final static byte[] getKeyId(final String tag) { - return Word.word2hash(tag.toLowerCase()); - } - - public final static byte[] keySetToBytes(final HashSet urlSet) { - return keySetToString(urlSet).getBytes(); - } - - public final static String keySetToString(final HashSet urlSet) { - final Iterator urlIter = urlSet.iterator(); - final - StringBuilder urls = new StringBuilder(urlSet.size()*20); - while(urlIter.hasNext()) { - urls.append(TAGS_SEPARATOR); - urls.append(urlIter.next()); - } - urls.deleteCharAt(0); - return 
urls.toString(); - } - - public final static HashSet keysStringToSet(final String keysString) { - HashSet keySet = new HashSet(); - final String[] keyArray = keysString.split(TAGS_SEPARATOR); - for (final String key : keyArray) { - keySet.add(key); - } - return keySet; - } - - public final static String cleanTagsString(final String tagsString) { - StringBuilder ts = new StringBuilder(tagsString); - if(ts.length() == 0) - return YMarkTables.BOOKMARK.TAGS.deflt(); - // get rid of double commas and space characters following a comma - for (int i = 0; i < ts.length()-1; i++) { - if (ts.charAt(i) == TAGS_SEPARATOR.charAt(0)) { - if (ts.charAt(i+1) == TAGS_SEPARATOR.charAt(0) || ts.charAt(i+1) == ' ') { - ts.deleteCharAt(i+1); - i--; - } - } - } - // get rid of heading and trailing comma - if (ts.charAt(0) == TAGS_SEPARATOR.charAt(0)) - ts.deleteCharAt(0); - if (ts.charAt(ts.length()-1) == TAGS_SEPARATOR.charAt(0)) - ts.deleteCharAt(ts.length()-1); - return ts.toString(); - } - - public final static String cleanFoldersString(final String foldersString) { - StringBuilder fs = new StringBuilder(cleanTagsString(foldersString)); - if(fs.length() == 0) - return YMarkTables.BOOKMARK.FOLDERS.deflt(); - for (int i = 0; i < fs.length()-1; i++) { - if (fs.charAt(i) == FOLDERS_SEPARATOR.charAt(0)) { - if (fs.charAt(i+1) == TAGS_SEPARATOR.charAt(0) || fs.charAt(i+1) == FOLDERS_SEPARATOR.charAt(0)) { - fs.deleteCharAt(i); - i--; - } else if (fs.charAt(i+1) == ' ') { - fs.deleteCharAt(i+1); - i--; - } - } - } - if (fs.charAt(fs.length()-1) == FOLDERS_SEPARATOR.charAt(0)) { - fs.deleteCharAt(fs.length()-1); - } - return fs.toString(); - } - - public void clearIndex(String tablename) { - if (tablename.endsWith(TABLES.TAGS.basename())) - this.tags.clearCache(); - if (tablename.endsWith(TABLES.FOLDERS.basename())) - this.folders.clearCache(); - } - - public void deleteBookmark(final String bmk_user, final byte[] urlHash) throws IOException, RowSpaceExceededException { - final String 
bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); - Tables.Row bmk_row = null; - bmk_row = this.worktables.select(bmk_table, urlHash); - if(bmk_row != null) { - final String tagsString = bmk_row.get(YMarkTables.BOOKMARK.TAGS.key(),YMarkTables.BOOKMARK.TAGS.deflt()); - tags.removeIndexEntry(bmk_user, tagsString, urlHash); - final String foldersString = bmk_row.get(YMarkTables.BOOKMARK.FOLDERS.key(),YMarkTables.FOLDERS_ROOT); - folders.removeIndexEntry(bmk_user, foldersString, urlHash); - this.worktables.delete(bmk_table,urlHash); - } - } - - public void deleteBookmark(final String bmk_user, final String url) throws IOException, RowSpaceExceededException { - this.deleteBookmark(bmk_user, getBookmarkId(url)); - } - - public void addBookmark(final String bmk_user, final HashMap bmk, final boolean importer) throws IOException, RowSpaceExceededException { - final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); - final String date = String.valueOf(System.currentTimeMillis()); - final byte[] urlHash = getBookmarkId(bmk.get(BOOKMARK.URL.key())); - Tables.Row bmk_row = null; - - if (urlHash != null) { - bmk_row = this.worktables.select(bmk_table, urlHash); - if (bmk_row == null) { - // create and insert new entry - final Data data = new Data(); - for (BOOKMARK b : BOOKMARK.values()) { - switch(b) { - case DATE_ADDED: - case DATE_MODIFIED: - if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) { - data.put(b.key(), bmk.get(b.key())); - } else { - data.put(b.key(), String.valueOf(System.currentTimeMillis()).getBytes()); - } - break; - case TAGS: - if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) { - this.tags.insertIndexEntry(bmk_user, bmk.get(b.key()), urlHash); - data.put(b.key(), bmk.get(b.key())); - } else { - this.tags.insertIndexEntry(bmk_user, b.deflt(), urlHash); - data.put(b.key(), b.deflt()); - } - break; - case FOLDERS: - if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) { - this.folders.insertIndexEntry(bmk_user, bmk.get(b.key()), 
urlHash); - data.put(b.key(), bmk.get(b.key())); - } else { - this.folders.insertIndexEntry(bmk_user, b.deflt(), urlHash); - data.put(b.key(), b.deflt()); - } - break; - default: - if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) { - data.put(b.key(), bmk.get(b.key())); - } - } - } - this.worktables.insert(bmk_table, urlHash, data); - } else { - // modify and update existing entry - HashSet oldSet; - HashSet newSet; - for (BOOKMARK b : BOOKMARK.values()) { - switch(b) { - case DATE_ADDED: - if(!bmk_row.containsKey(b.key)) - bmk_row.put(b.key(), date); - break; - case DATE_MODIFIED: - bmk_row.put(b.key(), date); - break; - case TAGS: - oldSet = keysStringToSet(bmk_row.get(b.key(),b.deflt())); - if(bmk.containsKey(b.key())) { - newSet = keysStringToSet(bmk.get(b.key())); - if(importer) { - newSet.addAll(oldSet); - bmk_row.put(b.key(), keySetToString(newSet)); - oldSet.clear(); - } else { - bmk_row.put(b.key, bmk.get(b.key())); - } - } else { - newSet = new HashSet(); - bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt())); - } - this.tags.updateIndexEntry(bmk_user, urlHash, oldSet, newSet); - break; - case FOLDERS: - oldSet = keysStringToSet(bmk_row.get(b.key(),b.deflt())); - if(bmk.containsKey(b.key())) { - newSet = keysStringToSet(bmk.get(b.key())); - if(importer) { - newSet.addAll(oldSet); - bmk_row.put(b.key(), keySetToString(newSet)); - oldSet.clear(); - } else { - bmk_row.put(b.key, bmk.get(b.key())); - } - } else { - newSet = new HashSet(); - bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt())); - } - this.folders.updateIndexEntry(bmk_user, urlHash, oldSet, newSet); - break; - default: - if(bmk.containsKey(b.key())) { - bmk_row.put(b.key, bmk.get(b.key())); - } else { - bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt())); - } - } - } - // update bmk_table - this.worktables.update(bmk_table, bmk_row); - } - } - } - - public static EnumMap getMetadata(final byte[] urlHash, final Segment indexSegment) { - final EnumMap metadata = new EnumMap(METADATA.class); 
- final URIMetadataRow urlEntry = indexSegment.urlMetadata().load(urlHash, null, 0); - if (urlEntry != null) { - metadata.put(METADATA.IN_URLDB, "true"); - metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size())); - metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate())); - metadata.put(METADATA.LOADDATE, ISO8601Formatter.FORMATTER.format(urlEntry.loaddate())); - metadata.put(METADATA.MODDATE, ISO8601Formatter.FORMATTER.format(urlEntry.moddate())); - metadata.put(METADATA.SNIPPET, String.valueOf(urlEntry.snippet())); - metadata.put(METADATA.WORDCOUNT, String.valueOf(urlEntry.wordCount())); - metadata.put(METADATA.MIMETYPE, String.valueOf(urlEntry.doctype())); - metadata.put(METADATA.LANGUAGE, UTF8.String(urlEntry.language())); - - final URIMetadataRow.Components meta = urlEntry.metadata(); - if (meta != null) { - metadata.put(METADATA.TITLE, meta.dc_title()); - metadata.put(METADATA.CREATOR, meta.dc_creator()); - metadata.put(METADATA.KEYWORDS, meta.dc_subject()); - metadata.put(METADATA.PUBLISHER, meta.dc_publisher()); - } - } - return metadata; - } - - public static EnumMap getMetadata(final Document document) { - final EnumMap metadata = new EnumMap(METADATA.class); - metadata.put(METADATA.IN_URLDB, "false"); - if(document != null) { - metadata.put(METADATA.TITLE, document.dc_title()); - metadata.put(METADATA.CREATOR, document.dc_creator()); - metadata.put(METADATA.KEYWORDS, document.dc_subject(' ')); - metadata.put(METADATA.PUBLISHER, document.dc_publisher()); - metadata.put(METADATA.DESCRIPTION, document.dc_description()); - metadata.put(METADATA.MIMETYPE, document.dc_format()); - metadata.put(METADATA.LANGUAGE, document.dc_language()); - metadata.put(METADATA.CHARSET, document.getCharset()); - // metadata.put(METADATA.SIZE, String.valueOf(document.getTextLength())); - } - return metadata; - } - - public String autoTag(final Document document, final String bmk_user, final int count) { - final StringBuilder buffer = new 
StringBuilder(); - final Map words; - if(document != null) { - try { - words = new Condenser(document, true, true, LibraryProvider.dymLib).words(); - buffer.append(document.dc_title()); - buffer.append(document.dc_description()); - buffer.append(document.dc_subject(' ')); - final Enumeration tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib); - while(tokens.hasMoreElements()) { - int max = 1; - String token = tokens.nextElement(); - Word word = words.get(token); - if (words.containsKey(token)) { - if (this.worktables.has(TABLES.TAGS.tablename(bmk_user), getKeyId(token))) { - max = word.occurrences() * 1000; - } else if (token.length()>3) { - max = word.occurrences() * 100; - } - for(int i=0; i topwords = new ArrayList(sortWordCounts(words).descendingKeySet()); - for(int i=0; i 100) { - buffer.append(topwords.get(i)); - buffer.append(YMarkTables.TAGS_SEPARATOR); - } - } - } catch (UnsupportedEncodingException e) { - Log.logException(e); - } catch (IOException e) { - Log.logException(e); - } - } - return YMarkTables.cleanTagsString(buffer.toString()); - } - - public static TreeMap getWordCounts(final Document document) { - if (document != null) { - return sortWordCounts(new Condenser(document, true, true, LibraryProvider.dymLib).words()); - } - return new TreeMap(); - } - - public static TreeMap sortWordCounts(final Map unsorted_words) { - final TreeMap sorted_words = new TreeMap(new YMarkWordCountComparator(unsorted_words)); - sorted_words.putAll(unsorted_words); - return sorted_words; - } - -} diff --git a/source/de/anomic/data/YMarkWordCountComparator.java b/source/de/anomic/data/YMarkWordCountComparator.java deleted file mode 100644 index 6505bd846..000000000 --- a/source/de/anomic/data/YMarkWordCountComparator.java +++ /dev/null @@ -1,27 +0,0 @@ -package de.anomic.data; - -import java.util.Comparator; -import java.util.Map; - -import net.yacy.kelondro.data.word.Word; - -public class 
YMarkWordCountComparator implements Comparator { - - private Map words; - - public YMarkWordCountComparator(final Map words) { - this.words = words; - } - - public int compare(final String k1, final String k2) { - final Word w1 = this.words.get(k1); - final Word w2 = this.words.get(k2); - - if(w1.occurrences() > w2.occurrences()) - return 1; - else if(w1.occurrences() < w2.occurrences()) - return -1; - else - return 0; - } -} diff --git a/source/de/anomic/data/ymark/YMarkCrawlStart.java b/source/de/anomic/data/ymark/YMarkCrawlStart.java new file mode 100644 index 000000000..062d9b886 --- /dev/null +++ b/source/de/anomic/data/ymark/YMarkCrawlStart.java @@ -0,0 +1,90 @@ +// YMarkCrawlStart.java +// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany +// first published 2010 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2011-03-09 13:50:39 +0100 (Mi, 09 Mrz 2011) $ +// $LastChangedRevision: 7574 $ +// $LastChangedBy: apfelmaennchen $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.data.ymark;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.regex.Pattern;
+
+import net.yacy.kelondro.blob.Tables;
+import net.yacy.kelondro.logging.Log;
+
+import de.anomic.data.WorkTables;
+
+/**
+ * Map of the query parameters of a recorded crawl start.
+ *
+ * The parameters are read from the API call table of the work tables: the
+ * first recorded call of type "crawler" whose URL column matches
+ * <code>crawlingURL=&lt;url&gt;</code> is split into key/value pairs which
+ * are stored in this map. Keys and values are stored verbatim as they appear
+ * in the recorded URL (no URL decoding is applied).
+ */
+public class YMarkCrawlStart extends HashMap<String, String> {
+
+    private static final long serialVersionUID = 1L;
+    private final WorkTables worktables;
+
+    /**
+     * Creates an empty map; call {@link #load(String)} to fill it.
+     * @param worktables the work tables holding the recorded API calls
+     */
+    public YMarkCrawlStart(final WorkTables worktables) {
+        this.worktables = worktables;
+    }
+
+    /**
+     * Creates the map and immediately loads the crawl start parameters for the given url.
+     * @param worktables the work tables holding the recorded API calls
+     * @param url the crawl start url to look up
+     */
+    public YMarkCrawlStart(final WorkTables worktables, final String url) {
+        this.worktables = worktables;
+        this.load(url);
+    }
+
+    /**
+     * Looks up the first recorded "crawler" API call for the given url and adds
+     * its query parameters to this map. Entries already present are kept unless
+     * a loaded parameter has the same key. If no matching call exists, or the
+     * table cannot be read, the map is left unchanged (a read failure is logged).
+     * @param url the crawl start url to look up; <code>null</code> loads nothing
+     */
+    public void load(String url) {
+        if (url == null) {
+            return;
+        }
+        try {
+            // Pattern.quote keeps the \Q...\E quoting intact even if the url itself contains "\E".
+            // NOTE(review): the '?' following the quoted url makes the last character of the
+            // url optional - kept from the original expression, confirm that this is intended.
+            final Pattern pattern = Pattern.compile("^.*crawlingURL=" + Pattern.quote(url) + "?.*");
+            final Iterator<Tables.Row> apiCalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern);
+            while (apiCalls.hasNext()) {
+                final Tables.Row row = apiCalls.next();
+                if (row.get(WorkTables.TABLE_API_COL_TYPE, "").equals("crawler")) {
+                    final String apiUrl = row.get(WorkTables.TABLE_API_COL_URL, "");
+                    // everything behind the first '?' is the query string; without a '?'
+                    // indexOf returns -1 and the whole string is parsed
+                    this.putQueryParameters(apiUrl.substring(apiUrl.indexOf('?') + 1));
+                    // only the first matching crawl start is evaluated
+                    break;
+                }
+            }
+        } catch (IOException e) {
+            Log.logException(e);
+        }
+    }
+
+    /**
+     * Splits a query string of the form <code>k1=v1&amp;k2=v2</code> and puts every pair into this map.
+     * Empty segments are skipped; a segment without '=' is stored with an empty value.
+     * @param query the query string without the leading '?'
+     */
+    private void putQueryParameters(final String query) {
+        int start = 0;
+        while (start < query.length()) {
+            int end = query.indexOf('&', start);
+            if (end < 0) {
+                // last parameter: the value runs up to the end of the string (inclusive)
+                end = query.length();
+            }
+            final int eq = query.indexOf('=', start);
+            if (eq >= 0 && eq < end) {
+                this.put(query.substring(start, eq), query.substring(eq + 1, end));
+            } else if (end > start) {
+                this.put(query.substring(start, end), "");
+            }
+            start = end + 1;
+        }
+    }
+}
diff --git a/source/de/anomic/data/ymark/YMarkDate.java
b/source/de/anomic/data/ymark/YMarkDate.java new file mode 100644 index 000000000..5133bcedd --- /dev/null +++ b/source/de/anomic/data/ymark/YMarkDate.java @@ -0,0 +1,92 @@ +// YMarkDate.java +// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany +// first published 2010 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2011-03-09 13:50:39 +0100 (Mi, 09 Mrz 2011) $ +// $LastChangedRevision: 7574 $ +// $LastChangedBy: apfelmaennchen $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.data.ymark;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+
+import net.yacy.cora.date.ISO8601Formatter;
+import net.yacy.cora.document.UTF8;
+
+/**
+ * Mutable holder for a point in time, kept as milliseconds since the epoch.
+ *
+ * The value can be set from a long, from the bytes of its decimal string
+ * representation, or by parsing an ISO 8601 string, and can be rendered
+ * back in any of these forms.
+ */
+public class YMarkDate {
+
+    private long date;
+
+    /**
+     * Creates a date holding the current system time.
+     */
+    public YMarkDate() {
+        this.date = System.currentTimeMillis();
+    }
+
+    /**
+     * Creates a date from the bytes of a decimal milliseconds string, see {@link #set(byte[])}.
+     * @param date the bytes of the decimal string representation of the milliseconds value
+     */
+    public YMarkDate(final byte[] date) {
+        this.set(date);
+    }
+
+    /**
+     * Parses an ISO 8601 string, stores the result in this object and returns it.
+     * Accepted are a plain date of exactly 10 characters (<code>yyyy-MM-dd</code>, interpreted
+     * by SimpleDateFormat in the default time zone) or a date-time
+     * <code>yyyy-MM-dd'T'HH:mm:ss</code> followed by either <code>Z</code> or a six
+     * character offset such as <code>+01:00</code>.
+     * @param s the string to parse
+     * @return the parsed time in milliseconds since the epoch
+     * @throws ParseException if s is null, empty, too short to carry an offset, or not parseable
+     */
+    public long parseISO8601(final String s) throws ParseException {
+        if (s == null || s.length() < 1) {
+            throw new ParseException("parseISO8601 - empty string, nothing to parse", 0);
+        }
+        final SimpleDateFormat dateformat;
+        final StringBuilder date = new StringBuilder(s);
+        if (s.length() == 10) {
+            dateformat = new SimpleDateFormat("yyyy-MM-dd");
+        } else {
+            dateformat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssz");
+            // the 'z' pattern letter needs a general time zone, so the ISO 8601
+            // zone designator is rewritten to the GMT+hh:mm / GMT-hh:mm form
+            if (date.charAt(date.length() - 1) == 'Z') {
+                date.deleteCharAt(date.length() - 1);
+                date.append("GMT-00:00");
+            } else if (date.length() >= 6) {
+                date.insert(date.length() - 6, "GMT");
+            } else {
+                // without this guard insert() would be called with a negative index and
+                // fail with an unchecked StringIndexOutOfBoundsException
+                throw new ParseException("parseISO8601 - string too short to carry a time zone offset: " + s, 0);
+            }
+        }
+        this.date = dateformat.parse(date.toString()).getTime();
+        return this.date;
+    }
+
+    /**
+     * @return the stored time formatted by ISO8601Formatter.FORMATTER
+     */
+    public String toISO8601() {
+        return ISO8601Formatter.FORMATTER.format(new Date(this.date));
+    }
+
+    /**
+     * @return the bytes of the decimal string representation of the milliseconds value
+     */
+    public byte[] toBytes() {
+        // explicit charset instead of the platform default of String.getBytes()
+        return UTF8.getBytes(String.valueOf(this.date));
+    }
+
+    /**
+     * @return the decimal string representation of the milliseconds value
+     */
+    @Override
+    public String toString() {
+        return String.valueOf(this.date);
+    }
+
+    /**
+     * @return the stored time in milliseconds since the epoch
+     */
+    public long get() {
+        return this.date;
+    }
+
+    /**
+     * @param date the time in milliseconds since the epoch
+     */
+    public void set(long date) {
+        this.date = date;
+    }
+
+    /**
+     * Sets the time from the bytes of a decimal milliseconds string.
+     * @param date the bytes of the decimal string representation of the milliseconds value
+     * @throws NumberFormatException if the decoded text is not a parsable long
+     */
+    public void set(byte[] date) {
+        this.date = Long.parseLong(UTF8.String(date));
+    }
+}
diff --git a/source/de/anomic/data/YMarksHTMLImporter.java b/source/de/anomic/data/ymark/YMarkHTMLImporter.java
similarity index 75%
rename from source/de/anomic/data/YMarksHTMLImporter.java
rename to source/de/anomic/data/ymark/YMarkHTMLImporter.java
index
b38f2ad62..6d5c79c0b 100644 --- a/source/de/anomic/data/YMarksHTMLImporter.java +++ b/source/de/anomic/data/ymark/YMarkHTMLImporter.java @@ -1,4 +1,30 @@ -package de.anomic.data; +// YMarkHTMLImporter.java +// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany +// first published 2010 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.data.ymark; import java.io.IOException; import java.io.InputStream; @@ -14,7 +40,7 @@ import javax.swing.text.html.parser.ParserDelegator; import net.yacy.kelondro.logging.Log; -public class YMarksHTMLImporter extends HTMLEditorKit.ParserCallback implements Runnable { +public class YMarkHTMLImporter extends HTMLEditorKit.ParserCallback implements Runnable { public static enum STATE { NOTHING, @@ -35,7 +61,7 @@ public class YMarksHTMLImporter extends HTMLEditorKit.ParserCallback implements private final BlockingQueue> bookmarks; private final ParserDelegator htmlParser; - public YMarksHTMLImporter(final InputStream input, int queueSize) { + public YMarkHTMLImporter(final InputStream input, int queueSize) { this.state = STATE.NOTHING; this.prevTag = null; this.bmk = 
new HashMap(); @@ -76,7 +102,7 @@ public class YMarksHTMLImporter extends HTMLEditorKit.ParserCallback implements this.bmk.put(YMarkTables.BOOKMARK.VISITS.key(), YMarkTables.BOOKMARK.VISITS.deflt()); break; case FOLDER: - this.folder.append(YMarkTables.FOLDERS_SEPARATOR); + this.folder.append(YMarkUtil.FOLDERS_SEPARATOR); this.folder.append(data); break; case FOLDER_DESC: @@ -109,7 +135,7 @@ public class YMarksHTMLImporter extends HTMLEditorKit.ParserCallback implements switch(bmk) { case TAGS: // mozilla shortcuturl - this.bmk.put(bmk.key(), YMarkTables.cleanTagsString(s)); + this.bmk.put(bmk.key(), YMarkUtil.cleanTagsString(s)); break; case DATE_ADDED: case DATE_MODIFIED: @@ -138,7 +164,7 @@ public class YMarksHTMLImporter extends HTMLEditorKit.ParserCallback implements } else if (t == HTML.Tag.DL) { //TODO: get rid of .toString.equals() if(!this.folder.toString().equals(YMarkTables.FOLDERS_IMPORTED)) { - folder.setLength(folder.lastIndexOf(YMarkTables.FOLDERS_SEPARATOR)); + folder.setLength(folder.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR)); } } else { state = STATE.NOTHING; diff --git a/source/de/anomic/data/ymark/YMarkMetadata.java b/source/de/anomic/data/ymark/YMarkMetadata.java new file mode 100644 index 000000000..12eb43f80 --- /dev/null +++ b/source/de/anomic/data/ymark/YMarkMetadata.java @@ -0,0 +1,201 @@ +// YMarkMetadata.java +// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany +// first published 2010 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2011-03-09 13:50:39 +0100 (Mi, 09 Mrz 2011) $ +// $LastChangedRevision: 7574 $ +// $LastChangedBy: apfelmaennchen $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. 
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.data.ymark;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.Enumeration;
+import java.util.Map;
+import java.util.TreeMap;
+
+import net.yacy.cora.date.ISO8601Formatter;
+import net.yacy.cora.document.UTF8;
+import net.yacy.document.Condenser;
+import net.yacy.document.Document;
+import net.yacy.document.LibraryProvider;
+import net.yacy.document.WordTokenizer;
+import net.yacy.document.Parser.Failure;
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.word.Word;
+import net.yacy.repository.LoaderDispatcher;
+import de.anomic.crawler.CrawlProfile;
+import de.anomic.crawler.retrieval.Response;
+import de.anomic.search.Segments;
+
+/**
+ * Collects metadata for a bookmarked URL, either from the url database of the
+ * index ({@link #getMetadata()}) or from the loaded and parsed document
+ * ({@link #loadMetadata()}), and derives tag suggestions from the document text.
+ */
+public class YMarkMetadata {
+    /** URL the metadata belongs to; null when the document identifier was malformed. */
+    private DigestURI uri;
+    /** Parsed document; null until set by a constructor or {@link #loadDocument(LoaderDispatcher)}. */
+    Document document;
+    /** Index segments used by {@link #getMetadata()}; null unless passed to the constructor. */
+    Segments indexSegment;
+
+    public enum METADATA {
+        TITLE,
+        DESCRIPTION,
+        FAVICON,
+        KEYWORDS,
+        LANGUAGE,
+        CREATOR,
+        PUBLISHER,
+        CHARSET,
+        MIMETYPE,
+        SIZE,
+        WORDCOUNT,
+        IN_URLDB,
+        FRESHDATE,
+        LOADDATE,
+        MODDATE,
+        SNIPPET,
+        AUTOTAG
+    }
+
+    /**
+     * @param uri the URL to collect metadata for
+     */
+    public YMarkMetadata(final DigestURI uri) {
+        this.uri = uri;
+        this.document = null;
+        this.indexSegment = null;
+    }
+
+    /**
+     * @param uri the URL to collect metadata for
+     * @param indexSegment the index segments queried by {@link #getMetadata()}
+     */
+    public YMarkMetadata(final DigestURI uri, final Segments indexSegment) {
+        this.uri = uri;
+        this.document = null;
+        this.indexSegment = indexSegment;
+    }
+
+    /**
+     * @param document an already parsed document; its dc_identifier becomes the URL
+     */
+    public YMarkMetadata(final Document document) {
+        this.document = document;
+        try {
+            this.uri = new DigestURI(this.document.dc_identifier());
+        } catch (MalformedURLException e) {
+            // keep the document usable; URL based lookups are skipped for a null uri
+            this.uri = null;
+        }
+        this.indexSegment = null;
+    }
+
+    /**
+     * Loads and parses the document behind the URL unless a document is already present.
+     *
+     * @param loader the loader used to fetch the resource
+     * @throws IOException if no URL is available or loading fails
+     * @throws Failure if parsing fails
+     */
+    public void loadDocument(LoaderDispatcher loader) throws IOException, Failure {
+        if (this.document != null) {
+            return;
+        }
+        if (this.uri == null) {
+            throw new IOException("YMarkMetadata - no valid URL to load a document from");
+        }
+        final Response response = loader.load(loader.request(this.uri, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true);
+        this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
+    }
+
+    /**
+     * Reads the metadata stored for the URL in the url database of the public index segment.
+     *
+     * @return the metadata found; empty if no index segment or URL is available
+     *         or the URL is not in the database
+     */
+    public EnumMap<METADATA, String> getMetadata() {
+        final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
+        if (this.indexSegment == null || this.uri == null) {
+            // nothing to look up with: two of the three constructors leave indexSegment null
+            return metadata;
+        }
+        final URIMetadataRow urlEntry = this.indexSegment.segment(Segments.Process.PUBLIC).urlMetadata().load(this.uri.hash(), null, 0);
+        if (urlEntry != null) {
+            metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size()));
+            metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate()));
+            metadata.put(METADATA.LOADDATE, ISO8601Formatter.FORMATTER.format(urlEntry.loaddate()));
+            metadata.put(METADATA.MODDATE, ISO8601Formatter.FORMATTER.format(urlEntry.moddate()));
+            metadata.put(METADATA.SNIPPET, String.valueOf(urlEntry.snippet()));
+            metadata.put(METADATA.WORDCOUNT, String.valueOf(urlEntry.wordCount()));
+            metadata.put(METADATA.MIMETYPE, String.valueOf(urlEntry.doctype()));
+            metadata.put(METADATA.LANGUAGE, UTF8.String(urlEntry.language()));
+
+            final URIMetadataRow.Components meta = urlEntry.metadata();
+            if (meta != null) {
+                metadata.put(METADATA.TITLE, meta.dc_title());
+                metadata.put(METADATA.CREATOR, meta.dc_creator());
+                metadata.put(METADATA.KEYWORDS, meta.dc_subject());
+                metadata.put(METADATA.PUBLISHER, meta.dc_publisher());
+            }
+        }
+        return metadata;
+    }
+
+    /**
+     * Reads the metadata from the parsed document.
+     *
+     * @return the document metadata including five suggested tags;
+     *         empty if no document is present
+     */
+    public EnumMap<METADATA, String> loadMetadata() {
+        final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
+        if (this.document != null) {
+            metadata.put(METADATA.TITLE, this.document.dc_title());
+            metadata.put(METADATA.CREATOR, this.document.dc_creator());
+            metadata.put(METADATA.KEYWORDS, this.document.dc_subject(' '));
+            metadata.put(METADATA.PUBLISHER, this.document.dc_publisher());
+            metadata.put(METADATA.DESCRIPTION, this.document.dc_description());
+            metadata.put(METADATA.MIMETYPE, this.document.dc_format());
+            metadata.put(METADATA.LANGUAGE, this.document.dc_language());
+            metadata.put(METADATA.CHARSET, this.document.getCharset());
+            // metadata.put(METADATA.SIZE, String.valueOf(document.getTextLength()));
+            metadata.put(METADATA.AUTOTAG, this.autoTag(5));
+        }
+        return metadata;
+    }
+
+    /**
+     * Suggests tags for the document: words that also occur in title,
+     * description or subject get their occurrence count boosted, then the
+     * most frequent words are returned.
+     *
+     * @param count maximum number of tags to return
+     * @return a cleaned, separator delimited tags string (see YMarkUtil.cleanTagsString)
+     */
+    public String autoTag(final int count) {
+        final StringBuilder buffer = new StringBuilder();
+        final Map<String, Word> words;
+        if (this.document != null) {
+            words = new Condenser(this.document, true, true, LibraryProvider.dymLib).words();
+            buffer.append(this.document.dc_title());
+            buffer.append(this.document.dc_description());
+            buffer.append(this.document.dc_subject(' '));
+            final Enumeration<String> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
+            // NOTE(review): the two loops marked below were garbled in the text this
+            // class was recovered from (everything between '<' and '>' was lost);
+            // loop bounds, word.inc() and buffer.setLength(0) are reconstructed -
+            // confirm against the repository before relying on them
+            while (tokens.hasMoreElements()) {
+                int max = 1;
+                final String token = tokens.nextElement();
+                final Word word = words.get(token);
+                if (word != null) {
+                    /*
+                    if (this.worktables.has(TABLES.TAGS.tablename(bmk_user), YMarkUtil.getKeyId(token))) {
+                        max = word.occurrences() * 1000;
+                    } else
+                    */
+                    if (token.length() > 3) {
+                        max = word.occurrences() * 100;
+                    }
+                    // reconstructed loop (1 of 2)
+                    for (int i = 0; i < max; i++) {
+                        word.inc();
+                    }
+                }
+            }
+            // the buffer held the title/description text so far; reuse it for the result
+            buffer.setLength(0);
+            final ArrayList<String> topwords = new ArrayList<String>(sortWordCounts(words).descendingKeySet());
+            // reconstructed loop header (2 of 2)
+            for (int i = 0; i < count && i < topwords.size(); i++) {
+                if (words.get(topwords.get(i)).occurrences() > 100) {
+                    buffer.append(topwords.get(i));
+                    buffer.append(YMarkUtil.TAGS_SEPARATOR);
+                }
+            }
+        }
+        return YMarkUtil.cleanTagsString(buffer.toString());
+    }
+
+    /**
+     * @return the words of the document ordered by YMarkWordCountComparator;
+     *         an empty map if no document is present
+     */
+    public TreeMap<String, Word> getWordCounts() {
+        if (this.document != null) {
+            return sortWordCounts(new Condenser(this.document, true, true, LibraryProvider.dymLib).words());
+        }
+        return new TreeMap<String, Word>();
+    }
+
+    /**
+     * Copies the words into a map that is ordered by a YMarkWordCountComparator
+     * built on the unsorted map.
+     *
+     * @param unsorted_words the words to sort
+     * @return a new sorted map
+     */
+    public static TreeMap<String, Word> sortWordCounts(final Map<String, Word> unsorted_words) {
+        final TreeMap<String, Word> sorted_words = new TreeMap<String, Word>(new YMarkWordCountComparator(unsorted_words));
+        sorted_words.putAll(unsorted_words);
+        return sorted_words;
+    }
+
+}
diff --git a/source/de/anomic/data/ymark/YMarkTables.java b/source/de/anomic/data/ymark/YMarkTables.java
new file mode 100644
index 000000000..94a2ec91f
--- /dev/null
+++ b/source/de/anomic/data/ymark/YMarkTables.java
@@ -0,0 +1,340 @@
+// YMarkTables.java
+// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
+// first published 2010 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.data.ymark;
+
+import java.io.IOException;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.TreeSet;
+import java.util.regex.Pattern;
+
+import de.anomic.data.WorkTables;
+
+import net.yacy.kelondro.blob.Tables;
+import net.yacy.kelondro.blob.Tables.Data;
+import net.yacy.kelondro.index.RowSpaceExceededException;
+
+/**
+ * Access layer for the per-user bookmark tables kept in the work tables:
+ * adds, updates and deletes bookmarks and selects them by folder or tag.
+ */
+public class YMarkTables {
+
+    /** Table name suffixes; the full table name is the user name followed by the suffix. */
+    public static enum TABLES {
+        BOOKMARKS ("_bookmarks"),
+        TAGS ("_tags"),
+        FOLDERS ("_folders");
+
+        private final String basename;
+
+        private TABLES(String b) {
+            this.basename = b;
+        }
+        public String basename() {
+            return this.basename;
+        }
+        public String tablename(String bmk_user) {
+            return bmk_user+this.basename;
+        }
+    }
+
+    public static enum PROTOCOLS {
+        HTTP ("http://"),
+        HTTPS ("https://");
+
+        private final String protocol;
+
+        private PROTOCOLS(String s) {
+            this.protocol = s;
+        }
+        public String protocol() {
+            return this.protocol;
+        }
+        public String protocol(String s) {
+            return this.protocol+s;
+        }
+    }
+
+    /** Columns of a bookmark row with their default value and import/export attribute names. */
+    public static enum BOOKMARK {
+        //             key              dflt         html_attrb       xbel_attrb     type
+        URL           ("url",           "",          "href",          "href",        "link"),
+        TITLE         ("title",         "",          "",              "",            "meta"),
+        DESC          ("desc",          "",          "",              "",            "comment"),
+        DATE_ADDED    ("date_added",    "",          "add_date",      "added",       "date"),
+        DATE_MODIFIED ("date_modified", "",          "last_modified", "modified",    "date"),
+        DATE_VISITED  ("date_visited",  "",          "last_visited",  "visited",     "date"),
+        PUBLIC        ("public",        "false",     "",              "yacy:public", "lock"),
+        TAGS          ("tags",          "unsorted",  "shortcuturl",   "yacy:tags",   "tag"),
+        VISITS        ("visits",        "0",         "",              "yacy:visits", "stat"),
+        FOLDERS       ("folders",       "/unsorted", "",              "",            "folder");
+
+        private final String key;
+        private final String dflt;
+        private final String html_attrb;
+        private final String xbel_attrb;
+        private final String type;
+
+        /** Reverse lookup from the column key to its enum constant. */
+        private static final Map<String, BOOKMARK> lookup = new HashMap<String, BOOKMARK>();
+        static {
+            for (BOOKMARK b : EnumSet.allOf(BOOKMARK.class))
+                lookup.put(b.key(), b);
+        }
+
+        private BOOKMARK(String k, String s, String a, String x, String t) {
+            this.key = k;
+            this.dflt = s;
+            this.html_attrb = a;
+            this.xbel_attrb = x;
+            this.type = t;
+        }
+        public static BOOKMARK get(String key) {
+            return lookup.get(key);
+        }
+        public static boolean contains(String key) {
+            return lookup.containsKey(key);
+        }
+        public String key() {
+            return this.key;
+        }
+        public String deflt() {
+            return this.dflt;
+        }
+        public String html_attrb() {
+            return this.html_attrb;
+        }
+        public String xbel_attrb() {
+            return this.xbel_attrb;
+        }
+        /**
+         * @return the text that closes the previous attribute value and opens this
+         *         one: a quote, a line break, a space, the xbel attribute name,
+         *         {@code =} and an opening quote
+         */
+        public String xbel() {
+            // a local builder: a buffer shared by all constants is not thread-safe
+            final StringBuilder sb = new StringBuilder(25);
+            sb.append('"');
+            sb.append('\n');
+            sb.append(' ');
+            sb.append(this.xbel_attrb);
+            sb.append('=');
+            sb.append('"');
+            return sb.toString();
+        }
+        public String type() {
+            return this.type;
+        }
+    }
+
+    public final static HashMap<String,String> POISON = new HashMap<String,String>();
+
+    public final static String FOLDERS_ROOT = "/";
+    public final static String FOLDERS_UNSORTED = "/unsorted";
+    public final static String FOLDERS_IMPORTED = "/imported";
+    public static final int FOLDER_BUFFER_SIZE = 100;
+
+    public final static String BOOKMARKS_LOG = "BOOKMARKS";
+    public final static String BOOKMARKS_ID = "id";
+
+    public final static String USER_ADMIN = "admin";
+    public final static String USER_AUTHENTICATE = "AUTHENTICATE";
+    public final static String USER_AUTHENTICATE_MSG = "Authentication required!";
+
+    private final WorkTables worktables;
+
+    /**
+     * @param wt the tables to work on; must be a WorkTables instance
+     */
+    public YMarkTables(final Tables wt) {
+        this.worktables = (WorkTables)wt;
+    }
+
+    /**
+     * Deletes the bookmark with the given id if it exists.
+     *
+     * @param bmk_user the owner of the bookmark table
+     * @param urlHash the bookmark id
+     */
+    public void deleteBookmark(final String bmk_user, final byte[] urlHash) throws IOException, RowSpaceExceededException {
+        final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
+        final Tables.Row bmk_row = this.worktables.select(bmk_table, urlHash);
+        if (bmk_row != null) {
+            this.worktables.delete(bmk_table, urlHash);
+        }
+    }
+
+    /**
+     * Deletes the bookmark for the given URL if it exists.
+     *
+     * @param bmk_user the owner of the bookmark table
+     * @param url the bookmarked URL
+     */
+    public void deleteBookmark(final String bmk_user, final String url) throws IOException, RowSpaceExceededException {
+        this.deleteBookmark(bmk_user, YMarkUtil.getBookmarkId(url));
+    }
+
+    /**
+     * Collects the folders below a root folder, including every intermediate
+     * path between the root and each folder found.
+     *
+     * @param bmk_user the owner of the bookmark table
+     * @param root the folder to start from, taken literally
+     * @return the sorted folder paths; contains root itself unless root is {@link #FOLDERS_ROOT}
+     */
+    public TreeSet<String> getFolders(final String bmk_user, final String root) throws IOException {
+        final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
+        // quote the root so that regex meta characters in folder names match literally
+        final Pattern r = Pattern.compile("(?:^|.*,)("+Pattern.quote(root)+"/.*)(?:,|$)");
+        final Iterator<Tables.Row> bit = this.worktables.iterator(bmk_table, BOOKMARK.FOLDERS.key(), r);
+        final TreeSet<String> folders = new TreeSet<String>();
+        final StringBuilder path = new StringBuilder(200);
+        while (bit.hasNext()) {
+            final Tables.Row bmk_row = bit.next();
+            if (!bmk_row.containsKey(BOOKMARK.FOLDERS.key())) {
+                continue;
+            }
+            final String[] folderArray = (new String(bmk_row.get(BOOKMARK.FOLDERS.key()),"UTF8")).split(YMarkUtil.TAGS_SEPARATOR);
+            for (final String folder : folderArray) {
+                if (!folder.startsWith(root) || folders.contains(folder)) {
+                    continue;
+                }
+                // add the folder and walk up towards the root, adding every parent
+                path.setLength(0);
+                path.append(folder);
+                //TODO: get rid of .toString.equals()
+                while (path.length() > 0 && !path.toString().equals(root)) {
+                    folders.add(path.toString());
+                    final int cut = path.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR);
+                    if (cut < 0) {
+                        // no separator left; setLength(-1) would throw
+                        break;
+                    }
+                    path.setLength(cut);
+                }
+            }
+        }
+        if (!root.equals(YMarkTables.FOLDERS_ROOT)) { folders.add(root); }
+        return folders;
+    }
+
+    /**
+     * @param bmk_user the owner of the bookmark table
+     * @param folder the exact folder path, taken literally
+     * @return the rows whose folders column lists the folder
+     */
+    public Iterator<Tables.Row> getBookmarksByFolder(final String bmk_user, final String folder) throws IOException {
+        final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
+        final Pattern p = Pattern.compile("(?:^|.*,)(" + Pattern.quote(folder) + ")(?:,|$)");
+        return this.worktables.iterator(bmk_table, BOOKMARK.FOLDERS.key(), p);
+    }
+
+    /**
+     * Selects bookmarks by tags using a pattern of the form
+     * {@code (?:^|.*,)((?:tag4|tag2|tag5),*.*){3}} with every tag quoted.
+     *
+     * @param bmk_user the owner of the bookmark table
+     * @param tagArray the tags to look for; must contain at least one tag
+     * @return the rows whose tags column matches the pattern
+     * @throws IllegalArgumentException if tagArray is null or empty
+     */
+    public Iterator<Tables.Row> getBookmarksByTag(final String bmk_user, final String[] tagArray) throws IOException {
+        if (tagArray == null || tagArray.length == 0) {
+            // an empty alternation would only fail later as an invalid pattern
+            throw new IllegalArgumentException("getBookmarksByTag - at least one tag is required");
+        }
+        final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
+        final StringBuilder buffer = new StringBuilder((tagArray.length * 25)+25);
+        buffer.append("(?:^|.*,)((?:");
+        for (int i = 0; i < tagArray.length; i++) {
+            if (i > 0) {
+                buffer.append('|');
+            }
+            buffer.append(Pattern.quote(tagArray[i]));
+        }
+        buffer.append("),*.*){");
+        buffer.append(tagArray.length);
+        buffer.append("}");
+        final Pattern p = Pattern.compile(buffer.toString());
+        return this.worktables.iterator(bmk_table, BOOKMARK.TAGS.key(), p);
+    }
+
+    /**
+     * Inserts a new bookmark or updates the existing one for the same URL.
+     * Nothing happens if no bookmark id can be derived from the URL.
+     *
+     * @param bmk_user the owner of the bookmark table
+     * @param bmk the bookmark columns by key; must contain the URL
+     * @param importer if true, tags and folders of an existing bookmark are
+     *        merged with the given ones instead of being replaced
+     */
+    public void addBookmark(final String bmk_user, final HashMap<String,String> bmk, final boolean importer) throws IOException, RowSpaceExceededException {
+        final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
+        final String date = String.valueOf(System.currentTimeMillis());
+        final byte[] urlHash = YMarkUtil.getBookmarkId(bmk.get(BOOKMARK.URL.key()));
+        if (urlHash == null) {
+            return;
+        }
+        final Tables.Row bmk_row = this.worktables.select(bmk_table, urlHash);
+        if (bmk_row == null) {
+            // create and insert new entry
+            this.worktables.insert(bmk_table, urlHash, newBookmarkData(bmk, date));
+        } else {
+            // modify and update existing entry
+            mergeIntoRow(bmk_row, bmk, date, importer);
+            this.worktables.update(bmk_table, bmk_row);
+        }
+    }
+
+    /** Builds the data of a new entry: given values, else date/default where one applies. */
+    private static Data newBookmarkData(final HashMap<String,String> bmk, final String date) {
+        final Data data = new Data();
+        for (final BOOKMARK b : BOOKMARK.values()) {
+            final String value = bmk.get(b.key());
+            switch(b) {
+                case DATE_ADDED:
+                case DATE_MODIFIED:
+                    data.put(b.key(), value != null ? value : date);
+                    break;
+                case TAGS:
+                case FOLDERS:
+                    data.put(b.key(), value != null ? value : b.deflt());
+                    break;
+                default:
+                    if (value != null) {
+                        data.put(b.key(), value);
+                    }
+            }
+        }
+        return data;
+    }
+
+    /** Applies the given values to an existing row; a null value counts as not given. */
+    private static void mergeIntoRow(final Tables.Row bmk_row, final HashMap<String,String> bmk, final String date, final boolean importer) {
+        for (final BOOKMARK b : BOOKMARK.values()) {
+            final String value = bmk.get(b.key());
+            switch(b) {
+                case DATE_ADDED:
+                    if (!bmk_row.containsKey(b.key()))
+                        bmk_row.put(b.key(), date);
+                    break;
+                case DATE_MODIFIED:
+                    bmk_row.put(b.key(), date);
+                    break;
+                case TAGS:
+                case FOLDERS:
+                    if (value == null) {
+                        bmk_row.put(b.key(), bmk_row.get(b.key(), b.deflt()));
+                    } else if (importer) {
+                        // imports add to the existing tags/folders
+                        final HashSet<String> merged = YMarkUtil.keysStringToSet(value);
+                        merged.addAll(YMarkUtil.keysStringToSet(bmk_row.get(b.key(), b.deflt())));
+                        bmk_row.put(b.key(), YMarkUtil.keySetToString(merged));
+                    } else {
+                        bmk_row.put(b.key(), value);
+                    }
+                    break;
+                default:
+                    if (value != null) {
+                        bmk_row.put(b.key(), value);
+                    } else {
+                        bmk_row.put(b.key(), bmk_row.get(b.key(), b.deflt()));
+                    }
+            }
+        }
+    }
+}
diff --git a/source/de/anomic/data/ymark/YMarkUtil.java b/source/de/anomic/data/ymark/YMarkUtil.java
new file mode 100644
index 000000000..86c7c7a20
--- /dev/null
+++ b/source/de/anomic/data/ymark/YMarkUtil.java
@@ -0,0 +1,114 @@
+// YMarkUtil.java
+// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
+// first published 2010 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate:
2011-03-09 13:50:39 +0100 (Mi, 09 Mrz 2011) $
+// $LastChangedRevision: 7574 $
+// $LastChangedBy: apfelmaennchen $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.data.ymark;
+
+import java.net.MalformedURLException;
+import java.util.HashSet;
+import java.util.Iterator;
+
+import net.yacy.cora.document.UTF8;
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.data.word.Word;
+
+/**
+ * Static helpers for bookmark ids and for the separator delimited
+ * tags and folders strings.
+ */
+public class YMarkUtil {
+    public final static String TAGS_SEPARATOR = ",";
+    public final static String FOLDERS_SEPARATOR = "/";
+
+    /**
+     * @param url the bookmarked URL
+     * @return the hash of the URL, used as bookmark id
+     * @throws MalformedURLException if the URL cannot be parsed
+     */
+    public final static byte[] getBookmarkId(String url) throws MalformedURLException {
+        return (new DigestURI(url, null)).hash();
+    }
+
+    /**
+     * @param tag a tag or folder name
+     * @return the word hash of the lower-cased name
+     */
+    public final static byte[] getKeyId(final String tag) {
+        return Word.word2hash(tag.toLowerCase());
+    }
+
+    /**
+     * @param urlSet the keys to join
+     * @return the UTF-8 bytes of {@link #keySetToString(HashSet)}
+     */
+    public final static byte[] keySetToBytes(final HashSet<String> urlSet) {
+        // explicit charset: keys may contain non-ASCII characters
+        return UTF8.getBytes(keySetToString(urlSet));
+    }
+
+    /**
+     * Joins the keys with {@link #TAGS_SEPARATOR}.
+     *
+     * @param urlSet the keys to join
+     * @return the joined keys; an empty string for a null or empty set
+     */
+    public final static String keySetToString(final HashSet<String> urlSet) {
+        if (urlSet == null || urlSet.isEmpty()) {
+            return "";
+        }
+        final StringBuilder urls = new StringBuilder(urlSet.size()*20);
+        boolean first = true;
+        for (final String key : urlSet) {
+            if (!first) {
+                urls.append(TAGS_SEPARATOR);
+            }
+            urls.append(key);
+            first = false;
+        }
+        return urls.toString();
+    }
+
+    /**
+     * Splits a string at {@link #TAGS_SEPARATOR}.
+     *
+     * @param keysString the separator delimited keys
+     * @return the distinct non-empty keys; an empty set for a null or empty string
+     */
+    public final static HashSet<String> keysStringToSet(final String keysString) {
+        final HashSet<String> keySet = new HashSet<String>();
+        if (keysString == null) {
+            return keySet;
+        }
+        for (final String key : keysString.split(TAGS_SEPARATOR)) {
+            // "".split(",") yields one empty string, which is not a key
+            if (key.length() > 0) {
+                keySet.add(key);
+            }
+        }
+        return keySet;
+    }
+
+    /**
+     * Removes doubled separators, blanks following a separator and a leading
+     * or trailing separator.
+     *
+     * @param tagsString the tags as entered
+     * @return the cleaned tags string, or the TAGS default if nothing is left
+     */
+    public final static String cleanTagsString(final String tagsString) {
+        final String cleaned = stripSeparators(tagsString);
+        if (cleaned.length() == 0)
+            return YMarkTables.BOOKMARK.TAGS.deflt();
+        return cleaned;
+    }
+
+    /**
+     * Cleans a folders string like a tags string and additionally removes
+     * doubled folder separators, a folder separator directly before a tags
+     * separator, blanks following a folder separator and a trailing folder
+     * separator.
+     *
+     * @param foldersString the folders as entered
+     * @return the cleaned folders string, or the FOLDERS default if the input
+     *         holds nothing but tags separators
+     */
+    public final static String cleanFoldersString(final String foldersString) {
+        final StringBuilder fs = new StringBuilder(stripSeparators(foldersString));
+        if (fs.length() == 0)
+            return YMarkTables.BOOKMARK.FOLDERS.deflt();
+        final char tsep = TAGS_SEPARATOR.charAt(0);
+        final char fsep = FOLDERS_SEPARATOR.charAt(0);
+        for (int i = 0; i < fs.length()-1; i++) {
+            if (fs.charAt(i) == fsep) {
+                if (fs.charAt(i+1) == tsep || fs.charAt(i+1) == fsep) {
+                    fs.deleteCharAt(i);
+                    i--;
+                } else if (fs.charAt(i+1) == ' ') {
+                    fs.deleteCharAt(i+1);
+                    i--;
+                }
+            }
+        }
+        // the loop only deletes while at least two characters are left,
+        // so the buffer is not empty here
+        if (fs.charAt(fs.length()-1) == fsep) {
+            fs.deleteCharAt(fs.length()-1);
+        }
+        return fs.toString();
+    }
+
+    /** Shared cleaning step; returns an empty string if nothing but separators was given. */
+    private static String stripSeparators(final String keysString) {
+        if (keysString == null || keysString.length() == 0) {
+            return "";
+        }
+        final char sep = TAGS_SEPARATOR.charAt(0);
+        final StringBuilder ts = new StringBuilder(keysString);
+        // get rid of double commas and space characters following a comma
+        for (int i = 0; i < ts.length()-1; i++) {
+            if (ts.charAt(i) == sep) {
+                if (ts.charAt(i+1) == sep || ts.charAt(i+1) == ' ') {
+                    ts.deleteCharAt(i+1);
+                    i--;
+                }
+            }
+        }
+        // get rid of heading and trailing comma; the buffer may run empty in between
+        if (ts.length() > 0 && ts.charAt(0) == sep)
+            ts.deleteCharAt(0);
+        if (ts.length() > 0 && ts.charAt(ts.length()-1) == sep)
+            ts.deleteCharAt(ts.length()-1);
+        return ts.toString();
+    }
+}
diff --git a/source/de/anomic/data/ymark/YMarkWordCountComparator.java b/source/de/anomic/data/ymark/YMarkWordCountComparator.java
new file mode 100644
index 000000000..8ba178369
--- /dev/null
+++ b/source/de/anomic/data/ymark/YMarkWordCountComparator.java
@@ -0,0 +1,53 @@
+// YMarkWordCountComparator.java
+// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
+// first published 2010 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer
based web search engine
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.data.ymark;
+
+import java.util.Comparator;
+import java.util.Map;
+
+import net.yacy.kelondro.data.word.Word;
+
+/**
+ * Orders words by the occurrence count recorded for them in a backing map,
+ * lowest count first. Words with the same count are ordered by their natural
+ * string order, so two different words never compare as equal.
+ */
+public class YMarkWordCountComparator implements Comparator<String> {
+
+    /** Backing map the occurrence counts are read from at comparison time. */
+    private final Map<String,Word> words;
+
+    /**
+     * @param words the map that provides the occurrence count of each word
+     */
+    public YMarkWordCountComparator(final Map<String,Word> words) {
+        this.words = words;
+    }
+
+    /**
+     * @param k1 the first word
+     * @param k2 the second word
+     * @return a negative value, zero or a positive value as k1 sorts before,
+     *         equal to or after k2; zero only for equal strings
+     */
+    public int compare(final String k1, final String k2) {
+        final int c1 = count(k1);
+        final int c2 = count(k2);
+        if (c1 != c2) {
+            return (c1 < c2) ? -1 : 1;
+        }
+        // a sorted map treats keys that compare as 0 as the same key; without
+        // this tie-break all but one word per count would be dropped
+        return k1.compareTo(k2);
+    }
+
+    /** Occurrence count of a word; words missing from the backing map count as 0. */
+    private int count(final String key) {
+        final Word w = this.words.get(key);
+        return (w == null) ? 0 : w.occurrences();
+    }
+}
diff --git a/source/de/anomic/data/YMarksXBELImporter.java b/source/de/anomic/data/ymark/YMarkXBELImporter.java
similarity index 80%
rename from source/de/anomic/data/YMarksXBELImporter.java
rename to source/de/anomic/data/ymark/YMarkXBELImporter.java
index 015fe6f7e..85debaeba 100644
--- a/source/de/anomic/data/YMarksXBELImporter.java
+++ b/source/de/anomic/data/ymark/YMarkXBELImporter.java
@@ -1,4 +1,30 @@
-package de.anomic.data;
+// YMarkXBELImporter.java
+// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
+// first published 2010 on http://yacy.net
+//
+//
This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.data.ymark; import java.io.IOException; import java.io.InputStream; @@ -17,7 +43,7 @@ import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; -public class YMarksXBELImporter extends DefaultHandler implements Runnable { +public class YMarkXBELImporter extends DefaultHandler implements Runnable { public static enum XBEL { NOTHING (""), @@ -76,7 +102,7 @@ public class YMarksXBELImporter extends DefaultHandler implements Runnable { private final XMLReader xmlReader; private final String RootFolder; - public YMarksXBELImporter (final InputStream input, int queueSize, String root) throws SAXException { + public YMarkXBELImporter (final InputStream input, int queueSize, String root) throws SAXException { this.bmk = null; this.RootFolder = root; @@ -126,30 +152,30 @@ public class YMarksXBELImporter extends DefaultHandler implements Runnable { } public void startElement(final String uri, final String name, String tag, final Attributes atts) throws SAXException { - String date; + YMarkDate date = new YMarkDate(); if (tag == null) 
return; tag = tag.toLowerCase(); if (XBEL.BOOKMARK.tag().equals(tag)) { this.bmk = new HashMap(); this.bmk.put(YMarkTables.BOOKMARK.URL.key(), atts.getValue(uri, YMarkTables.BOOKMARK.URL.xbel_attrb())); try { - date = String.valueOf(YMarkTables.parseISO8601(atts.getValue(uri, YMarkTables.BOOKMARK.DATE_ADDED.xbel_attrb())).getTime()); + date.parseISO8601(atts.getValue(uri, YMarkTables.BOOKMARK.DATE_ADDED.xbel_attrb())); } catch (ParseException e) { - date = String.valueOf(System.currentTimeMillis()); + // TODO: exception handling } - this.bmk.put(YMarkTables.BOOKMARK.DATE_ADDED.key(), date); + this.bmk.put(YMarkTables.BOOKMARK.DATE_ADDED.key(), date.toString()); try { - date = String.valueOf(YMarkTables.parseISO8601(atts.getValue(uri, YMarkTables.BOOKMARK.DATE_VISITED.xbel_attrb())).getTime()); + date.parseISO8601(atts.getValue(uri, YMarkTables.BOOKMARK.DATE_VISITED.xbel_attrb())); } catch (ParseException e) { - date = YMarkTables.BOOKMARK.DATE_VISITED.deflt(); + // TODO: exception handling } - this.bmk.put(YMarkTables.BOOKMARK.DATE_VISITED.key(), date); + this.bmk.put(YMarkTables.BOOKMARK.DATE_VISITED.key(), date.toString()); try { - date = String.valueOf(YMarkTables.parseISO8601(atts.getValue(uri, YMarkTables.BOOKMARK.DATE_MODIFIED.xbel_attrb())).getTime()); + date.parseISO8601(atts.getValue(uri, YMarkTables.BOOKMARK.DATE_MODIFIED.xbel_attrb())); } catch (ParseException e) { - date = String.valueOf(System.currentTimeMillis()); + // TODO: exception handling } - this.bmk.put(YMarkTables.BOOKMARK.DATE_MODIFIED.key(), date); + this.bmk.put(YMarkTables.BOOKMARK.DATE_MODIFIED.key(), date.toString()); UpdateBmkRef(atts.getValue(uri, "id"), true); outer_state = XBEL.BOOKMARK; inner_state = XBEL.NOTHING; @@ -201,7 +227,7 @@ public class YMarksXBELImporter extends DefaultHandler implements Runnable { // go up one folder //TODO: get rid of .toString.equals() if(!this.folder.toString().equals(this.RootFolder)) { - 
folder.setLength(folder.lastIndexOf(YMarkTables.FOLDERS_SEPARATOR)); + folder.setLength(folder.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR)); } this.outer_state = XBEL.FOLDER; } else if (XBEL.INFO.tag().equals(tag)) { @@ -213,15 +239,15 @@ public class YMarksXBELImporter extends DefaultHandler implements Runnable { public void characters(final char ch[], final int start, final int length) { if (parse_value) { - buffer.append(ch, start, length); - switch(outer_state) { + buffer.append(ch, start, length); + switch(outer_state) { case BOOKMARK: switch(inner_state) { - case DESC: - this.bmk.put(YMarkTables.BOOKMARK.DESC.key(), this.buffer.toString()); + case DESC: + this.bmk.put(YMarkTables.BOOKMARK.DESC.key(), buffer.toString()); break; case TITLE: - this.bmk.put(YMarkTables.BOOKMARK.TITLE.key(), this.buffer.toString()); + this.bmk.put(YMarkTables.BOOKMARK.TITLE.key(), buffer.toString()); break; case METADATA: // TODO: handle xbel bookmark metadata @@ -235,7 +261,7 @@ public class YMarksXBELImporter extends DefaultHandler implements Runnable { case DESC: break; case TITLE: - this.folder.append(YMarkTables.FOLDERS_SEPARATOR); + this.folder.append(YMarkUtil.FOLDERS_SEPARATOR); this.folder.append(this.buffer); break; case METADATA: