From d31a632951a29d3d7a79b476b37a168182d5d32e Mon Sep 17 00:00:00 2001 From: apfelmaennchen Date: Sun, 9 Sep 2012 09:53:58 +0200 Subject: [PATCH 1/2] - added dmoz RDF dump importer - added indexing to Tables columns to support larger bookmark collections - added RDF output (HTTP) for public bookmarks at /YMarks.rdf - YMarkRDF also provides a Jena RDF Model as "internal" API - various other changes/fixes for YMarks (mainly backend) --- htroot/Table_YMark_p.java | 4 +- htroot/YMarks.html | 14 +- htroot/YMarks.java | 61 ++- htroot/YMarks.rdf | 1 + htroot/api/ymarks/get_tags.java | 8 +- htroot/api/ymarks/get_treeview.java | 65 ++- htroot/api/ymarks/get_xbel.java | 7 +- htroot/api/ymarks/get_ymark.java | 8 +- htroot/api/ymarks/import_ymark.java | 158 +++--- htroot/api/ymarks/manage_tags.java | 8 +- htroot/js/yacy-ymarks-bookmark-actions.js | 2 +- htroot/js/yacy-ymarks.js | 57 ++- .../de/anomic/data/ymark/MonitoredReader.java | 102 ++++ .../de/anomic/data/ymark/YMarkAutoTagger.java | 28 +- .../de/anomic/data/ymark/YMarkCrawlStart.java | 71 ++- .../anomic/data/ymark/YMarkDMOZImporter.java | 152 ++++++ source/de/anomic/data/ymark/YMarkEntry.java | 118 ++++- .../anomic/data/ymark/YMarkHTMLImporter.java | 274 +++++------ .../de/anomic/data/ymark/YMarkImporter.java | 157 ++++++ source/de/anomic/data/ymark/YMarkRDF.java | 114 +++++ source/de/anomic/data/ymark/YMarkTables.java | 212 +++++--- source/de/anomic/data/ymark/YMarkUtil.java | 24 +- .../anomic/data/ymark/YMarkXBELImporter.java | 460 +++++++++--------- .../yacy/cora/lod/vocabulary/AnnoteaA.java | 92 ++++ .../yacy/cora/lod/vocabulary/AnnoteaB.java | 97 ++++ .../yacy/cora/lod/vocabulary/DCElements.java | 76 +++ source/net/yacy/cora/lod/vocabulary/DMOZ.java | 104 ++++ source/net/yacy/cora/lod/vocabulary/Rdf.java | 3 +- source/net/yacy/kelondro/blob/Tables.java | 150 +++++- .../kelondro/blob/TablesColumnBLOBIndex.java | 205 ++++++++ .../yacy/kelondro/blob/TablesColumnIndex.java | 176 +++++++ 
.../kelondro/blob/TablesColumnRAMIndex.java | 124 +++++ 32 files changed, 2450 insertions(+), 682 deletions(-) create mode 100644 htroot/YMarks.rdf create mode 100644 source/de/anomic/data/ymark/MonitoredReader.java create mode 100644 source/de/anomic/data/ymark/YMarkDMOZImporter.java create mode 100644 source/de/anomic/data/ymark/YMarkImporter.java create mode 100644 source/de/anomic/data/ymark/YMarkRDF.java create mode 100644 source/net/yacy/cora/lod/vocabulary/AnnoteaA.java create mode 100644 source/net/yacy/cora/lod/vocabulary/AnnoteaB.java create mode 100644 source/net/yacy/cora/lod/vocabulary/DCElements.java create mode 100644 source/net/yacy/cora/lod/vocabulary/DMOZ.java create mode 100644 source/net/yacy/kelondro/blob/TablesColumnBLOBIndex.java create mode 100644 source/net/yacy/kelondro/blob/TablesColumnIndex.java create mode 100644 source/net/yacy/kelondro/blob/TablesColumnRAMIndex.java diff --git a/htroot/Table_YMark_p.java b/htroot/Table_YMark_p.java index 6ae16a9d9..44e0ab02e 100644 --- a/htroot/Table_YMark_p.java +++ b/htroot/Table_YMark_p.java @@ -242,8 +242,8 @@ public class Table_YMark_p { mapIterator = sb.tables.bookmarks.getBookmarksByFolder(bmk_user, post.get("folders")); } else if(post.containsKey("tags") && !post.get("tags").isEmpty()) { // mapIterator = sb.tables.orderByPK(sb.tables.bookmarks.tags.getBookmarks(bmk_user, post.get("tags")), maxcount).iterator(); - final String[] tagArray = YMarkUtil.cleanTagsString(post.get(YMarkEntry.BOOKMARK.TAGS.key())).split(YMarkUtil.TAGS_SEPARATOR); - mapIterator = sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagArray); + final String tagsString = YMarkUtil.cleanTagsString(post.get(YMarkEntry.BOOKMARK.TAGS.key())); + mapIterator = sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagsString); } else { mapIterator = sb.tables.orderByPK(sb.tables.iterator(table, matcher), maxcount).iterator(); } diff --git a/htroot/YMarks.html b/htroot/YMarks.html index 8b1c42e11..b61185cfe 100644 --- a/htroot/YMarks.html 
+++ b/htroot/YMarks.html @@ -37,7 +37,7 @@ To see a list of all APIs, please visit the -

Bookmarks

+

Bookmarks (user: #[user]# size: #[size]#)

-
+ +
diff --git a/htroot/YMarks.java b/htroot/YMarks.java index a99fcbf5c..193cf6f84 100644 --- a/htroot/YMarks.java +++ b/htroot/YMarks.java @@ -1,27 +1,80 @@ +import java.io.IOException; +import java.util.Iterator; + +import net.yacy.cora.document.UTF8; +import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.util.SpaceExceededException; +import net.yacy.kelondro.blob.Tables; +import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import de.anomic.data.UserDB; +import de.anomic.data.ymark.YMarkEntry; +import de.anomic.data.ymark.YMarkRDF; import de.anomic.data.ymark.YMarkTables; +import de.anomic.data.ymark.YMarkTables.TABLES; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; public class YMarks { - public static serverObjects respond(final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, final serverSwitch env) { + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); final UserDB.Entry user = sb.userDB.getUser(header); final boolean isAdmin = (sb.verifyAuthentication(header)); final boolean isAuthUser = user!= null && user.hasRight(UserDB.AccessRight.BOOKMARK_RIGHT); + final String path = header.get(HeaderFramework.CONNECTION_PROP_PATH); + if(path != null && path.endsWith(".rdf")) { + YMarkRDF rdf = new YMarkRDF("http://"+sb.peers.myAlternativeAddress()); + + if(post != null && post.containsKey(YMarkEntry.BOOKMARKS_ID)) { + final String id = post.get(YMarkEntry.BOOKMARKS_ID); + final int i = id.indexOf(':'); + final String bmk_user = id.substring(0,i); + final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); + final byte[] urlHash = UTF8.getBytes(id.substring(i+1, id.length())); + Tables.Row bmk_row; + try { + bmk_row = sb.tables.select(bmk_table, urlHash); + 
rdf.addBookmark(bmk_user, bmk_row); + } catch (IOException e) { + } catch (SpaceExceededException e) { + } + } else { + final Iterator iter = sb.tables.iterator(); + while(iter.hasNext()) { + final String bmk_table = iter.next(); + final int i = bmk_table.indexOf(TABLES.BOOKMARKS.basename()); + if(i > 0) { + final String bmk_user = bmk_table.substring(0, i); + try { + // TODO select only public bookmarks + rdf.addBookmarks(bmk_user, sb.tables.iterator(bmk_table)); + } catch (IOException e) { + // TODO exception handling + } + } + } + } + prop.put("rdf", rdf.getRDF("RDF/XML")); + return prop; + } if(isAdmin || isAuthUser) { prop.put("login", 1); final String bmk_user = (isAuthUser ? user.getUserName() : YMarkTables.USER_ADMIN); prop.putHTML("user", bmk_user.substring(0,1).toUpperCase() + bmk_user.substring(1)); - + int size; + try { + size = sb.tables.bookmarks.getSize(bmk_user); + } catch (IOException e) { + Log.logException(e); + size = 0; + } + prop.put("size", size); } else { prop.put("login", 0); - } - + } return prop; } } \ No newline at end of file diff --git a/htroot/YMarks.rdf b/htroot/YMarks.rdf new file mode 100644 index 000000000..0480a63e7 --- /dev/null +++ b/htroot/YMarks.rdf @@ -0,0 +1 @@ +#[rdf]# \ No newline at end of file diff --git a/htroot/api/ymarks/get_tags.java b/htroot/api/ymarks/get_tags.java index df7a00993..e78cca69a 100644 --- a/htroot/api/ymarks/get_tags.java +++ b/htroot/api/ymarks/get_tags.java @@ -46,12 +46,8 @@ public class get_tags { YMarkTag t; if (post != null && post.containsKey(TAG) && !post.get(TAG).isEmpty()) { - final String[] tagArray = YMarkUtil.cleanTagsString(post.get(TAG)).split(YMarkUtil.TAGS_SEPARATOR); - try { - tags = new TreeSet(sb.tables.bookmarks.getTags(sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagArray)).values()); - } catch (final IOException e) { - return prop; - } + final String tagsString = YMarkUtil.cleanTagsString(post.get(TAG)); + tags = new 
TreeSet(sb.tables.bookmarks.getTags(sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagsString)).values()); } else { try { tags = new TreeSet(sb.tables.bookmarks.getTags(bmk_user).values()); diff --git a/htroot/api/ymarks/get_treeview.java b/htroot/api/ymarks/get_treeview.java index e01bc8d8c..fc7b5fcbb 100644 --- a/htroot/api/ymarks/get_treeview.java +++ b/htroot/api/ymarks/get_treeview.java @@ -119,43 +119,38 @@ public class get_treeview { count++; } } - // loop through bookmarkList - try { - if(displayBmk && !root.isEmpty()) { - bit = sb.tables.bookmarks.getBookmarksByFolder(bmk_user, root); - while (bit.hasNext()) { - bmk_row = bit.next(); - if(bmk_row != null) { - final String url = UTF8.String(bmk_row.get(YMarkEntry.BOOKMARK.URL.key())); - final String title = bmk_row.get(YMarkEntry.BOOKMARK.TITLE.key(), YMarkEntry.BOOKMARK.TITLE.deflt()); + if(displayBmk && !root.isEmpty()) { + bit = sb.tables.bookmarks.getBookmarksByFolder(bmk_user, root); + while (bit.hasNext()) { + bmk_row = bit.next(); + if(bmk_row != null) { + final String url = UTF8.String(bmk_row.get(YMarkEntry.BOOKMARK.URL.key())); + final String title = bmk_row.get(YMarkEntry.BOOKMARK.TITLE.key(), YMarkEntry.BOOKMARK.TITLE.deflt()); - // TODO: get_treeview - get rid of bmtype - if (post.containsKey("bmtype")) { - if (post.get("bmtype").equals("title")) { - prop.putJSON("folders_"+count+"_foldername", title); - } else if (post.get("bmtype").equals("href")) { - prop.putJSON("folders_"+count+"_foldername", ""+title+""); - } - } else { - prop.putJSON("folders_"+count+"_foldername", url); - } - prop.put("folders_"+count+"_expanded", "false"); - prop.put("folders_"+count+"_url", url); - prop.put("folders_"+count+"_type", "file"); - prop.put("folders_"+count+"_hash", "b:"+new String(bmk_row.getPK())); - prop.put("folders_"+count+"_hasChildren", "true"); - prop.put("folders_"+count+"_comma", ","); - count++; - } - } - } - count--; - prop.put("folders_"+count+"_comma", ""); - count++; - prop.put("folders", 
count); - } catch (final IOException e) { - Log.logException(e); + // TODO: get_treeview - get rid of bmtype + if (post.containsKey("bmtype")) { + if (post.get("bmtype").equals("title")) { + prop.putJSON("folders_"+count+"_foldername", title); + } else if (post.get("bmtype").equals("href")) { + prop.putJSON("folders_"+count+"_foldername", ""+title+""); + } + } else { + prop.putJSON("folders_"+count+"_foldername", url); + } + prop.put("folders_"+count+"_expanded", "false"); + prop.put("folders_"+count+"_url", url); + prop.put("folders_"+count+"_type", "file"); + prop.put("folders_"+count+"_hash", "b:"+new String(bmk_row.getPK())); + prop.put("folders_"+count+"_hasChildren", "true"); + prop.put("folders_"+count+"_comma", ","); + count++; + } + } } + count--; + prop.put("folders_"+count+"_comma", ""); + count++; + prop.put("folders", count); } else if(displayBmk && isBookmark) { try { final String urlHash = post.get(ROOT).substring(2); diff --git a/htroot/api/ymarks/get_xbel.java b/htroot/api/ymarks/get_xbel.java index 79de80b34..d677ad0d6 100644 --- a/htroot/api/ymarks/get_xbel.java +++ b/htroot/api/ymarks/get_xbel.java @@ -85,12 +85,7 @@ public class get_xbel { prop.put("xbel_"+count+"_elements", "" + CharacterCoding.unicode2xml(foldername[n], true) + ""); count++; } - try { - bit = sb.tables.bookmarks.getBookmarksByFolder(bmk_user, folder); - } catch (final IOException e) { - // TODO: better error handling (avoid NPE) - bit = null; - } + bit = sb.tables.bookmarks.getBookmarksByFolder(bmk_user, folder); Tables.Row bmk_row = null; String urlHash; final YMarkDate date = new YMarkDate(); diff --git a/htroot/api/ymarks/get_ymark.java b/htroot/api/ymarks/get_ymark.java index acc8acc09..74922a461 100644 --- a/htroot/api/ymarks/get_ymark.java +++ b/htroot/api/ymarks/get_ymark.java @@ -13,6 +13,7 @@ import de.anomic.data.UserDB; import de.anomic.data.ymark.YMarkCrawlStart; import de.anomic.data.ymark.YMarkDate; import de.anomic.data.ymark.YMarkEntry; +import 
de.anomic.data.ymark.YMarkRDF; import de.anomic.data.ymark.YMarkTables; import de.anomic.data.ymark.YMarkTables.TABLES; import de.anomic.data.ymark.YMarkUtil; @@ -31,7 +32,7 @@ public class get_ymark { prop = new serverObjects(); int rp; // items per page - int page; // page + int page; // page int total; String sortorder; String sortname; @@ -68,8 +69,8 @@ public class get_ymark { if(!query.isEmpty()) { if(!qtype.isEmpty()) { if(qtype.equals("_tags")) { - final String[] tagArray = YMarkUtil.cleanTagsString(query).split(YMarkUtil.TAGS_SEPARATOR); - result = sb.tables.bookmarks.orderBookmarksBy(sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagArray), sortname, sortorder); + final String tags = YMarkUtil.cleanTagsString(query); + result = sb.tables.bookmarks.orderBookmarksBy(sb.tables.bookmarks.getBookmarksByTag(bmk_user, tags), sortname, sortorder); } else if(qtype.equals("_folder")) { result = sb.tables.bookmarks.orderBookmarksBy(sb.tables.bookmarks.getBookmarksByFolder(bmk_user, query), sortname, sortorder); } else { @@ -89,7 +90,6 @@ public class get_ymark { prop.put("page", page); prop.put("total", total); putProp(bookmarks, rp, page); - } else { prop.put(serverObjects.ACTION_AUTHENTICATE, YMarkTables.USER_AUTHENTICATE_MSG); } diff --git a/htroot/api/ymarks/import_ymark.java b/htroot/api/ymarks/import_ymark.java index 8fdeaad56..35310ffa4 100644 --- a/htroot/api/ymarks/import_ymark.java +++ b/htroot/api/ymarks/import_ymark.java @@ -1,33 +1,36 @@ +import java.io.BufferedReader; import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; -import java.util.Date; import java.util.Iterator; import java.util.concurrent.ArrayBlockingQueue; import java.util.regex.Pattern; +import java.util.zip.GZIPInputStream; import net.yacy.cora.document.UTF8; import 
net.yacy.cora.protocol.RequestHeader; -import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.Parser.Failure; import net.yacy.document.content.SurrogateReader; import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.workflow.InstantBusyThread; import net.yacy.search.Switchboard; import org.xml.sax.SAXException; -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.CrawlSwitchboard; -import de.anomic.crawler.retrieval.Request; import de.anomic.data.BookmarksDB; import de.anomic.data.UserDB; import de.anomic.data.WorkTables; +import de.anomic.data.ymark.MonitoredReader; import de.anomic.data.ymark.YMarkAutoTagger; +import de.anomic.data.ymark.YMarkCrawlStart; +import de.anomic.data.ymark.YMarkDMOZImporter; import de.anomic.data.ymark.YMarkEntry; import de.anomic.data.ymark.YMarkHTMLImporter; import de.anomic.data.ymark.YMarkJSONImporter; @@ -48,7 +51,6 @@ public class import_ymark { final boolean isAuthUser = user!= null && user.hasRight(UserDB.AccessRight.BOOKMARK_RIGHT); final int queueSize = 200; - Thread t; YMarkEntry bmk; // String root = YMarkEntry.FOLDERS_IMPORTED; String root = ""; @@ -71,8 +73,8 @@ public class import_ymark { if(post.get("autotag").equals("empty")) { empty = true; } - t = new Thread(new YMarkAutoTagger(autoTaggingQueue, sb.loader, sb.tables.bookmarks, bmk_user, merge),"YMarks - autoTagger"); - t.start(); + YMarkAutoTagger autoTagger = new YMarkAutoTagger(autoTaggingQueue, sb.loader, sb.tables.bookmarks, bmk_user, merge); + InstantBusyThread.oneTimeJob(autoTagger, 0); } if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) { @@ -86,7 +88,8 @@ public class import_ymark { root = post.get("root"); } if(post.containsKey("bmkfile") && !post.get("bmkfile").isEmpty() && post.containsKey("importer")){ - stream = new 
ByteArrayInputStream(UTF8.getBytes(post.get("bmkfile$file"))); + final byte[] bytes = UTF8.getBytes(post.get("bmkfile$file")); + stream = new ByteArrayInputStream(bytes); if(post.get("importer").equals("surro") && stream != null) { SurrogateReader surrogateReader; try { @@ -97,16 +100,15 @@ public class import_ymark { prop.put("status", "0"); return prop; } - t = new Thread(surrogateReader, "YMarks - Surrogate Reader"); - t.start(); + InstantBusyThread.oneTimeJob(surrogateReader, 0); while ((bmk = new YMarkEntry(surrogateReader.take())) != YMarkEntry.POISON) { putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink); } prop.put("status", "1"); } else { - InputStreamReader reader = null; + MonitoredReader reader = null; try { - reader = new InputStreamReader(stream,"UTF-8"); + reader = new MonitoredReader(new InputStreamReader(stream,"UTF-8"), 1024*16, bytes.length); } catch (final UnsupportedEncodingException e1) { //TODO: display an error message Log.logException(e1); @@ -115,11 +117,8 @@ public class import_ymark { } if(post.get("importer").equals("html") && reader != null) { final YMarkHTMLImporter htmlImporter = new YMarkHTMLImporter(reader, queueSize, root); - t = new Thread(htmlImporter, "YMarks - HTML Importer"); - t.start(); - while ((bmk = htmlImporter.take()) != YMarkEntry.POISON) { - putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink); - } + InstantBusyThread.oneTimeJob(htmlImporter, 0); + InstantBusyThread.oneTimeJob(htmlImporter.getConsumer(sb, bmk_user, autoTaggingQueue, autotag, empty, indexing, medialink), 0); prop.put("status", "1"); } else if(post.get("importer").equals("xbel") && reader != null) { final YMarkXBELImporter xbelImporter; @@ -132,17 +131,13 @@ public class import_ymark { prop.put("status", "0"); return prop; } - t = new Thread(xbelImporter, "YMarks - XBEL Importer"); - t.start(); - while ((bmk = xbelImporter.take()) != YMarkEntry.POISON) { - putBookmark(sb, bmk_user, bmk, 
autoTaggingQueue, autotag, empty, indexing, medialink); - } + InstantBusyThread.oneTimeJob(xbelImporter, 0); + InstantBusyThread.oneTimeJob(xbelImporter.getConsumer(sb, bmk_user, autoTaggingQueue, autotag, empty, indexing, medialink), 0); prop.put("status", "1"); } else if(post.get("importer").equals("json") && reader != null) { YMarkJSONImporter jsonImporter; jsonImporter = new YMarkJSONImporter(reader, queueSize, root); - t = new Thread(jsonImporter, "YMarks - JSON Importer"); - t.start(); + InstantBusyThread.oneTimeJob(jsonImporter, 0); while ((bmk = jsonImporter.take()) != YMarkEntry.POISON) { putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink); } @@ -167,14 +162,11 @@ public class import_ymark { } prop.put("status", "1"); } catch (final IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (final SpaceExceededException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + Log.logException(e); } catch (final Failure e) { - // TODO Auto-generated catch block - e.printStackTrace(); + Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); } } else if(post.containsKey("importer") && post.get("importer").equals("bmks")) { if(!isAdmin) { @@ -201,32 +193,46 @@ public class import_ymark { sb.tables.bookmarks.addBookmark(bmk_user, bmk_entry, merge, true); prop.put("status", "1"); } catch (final MalformedURLException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + Log.logException(e); } catch (final IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (final SpaceExceededException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); } } + } else if(post.containsKey("importer") && post.get("importer").equals("dmoz")) { + if(!isAdmin) { + prop.authenticationRequired(); + return prop; + } + try { + final File in = new 
File(sb.workPath, "content.rdf.u8.gz"); + final InputStream gzip = new FileInputStream(in); + final InputStream content = new GZIPInputStream(gzip); + final InputStreamReader reader = new InputStreamReader(content, "UTF-8"); + final BufferedReader breader = new BufferedReader(reader); + final MonitoredReader mreader = new MonitoredReader(breader, 1024*1024, in.length()); + + final String source = post.get("source", ""); + final YMarkDMOZImporter DMOZImporter = new YMarkDMOZImporter(mreader, queueSize, root, source); + + mreader.addChangeListener(sb.tables.bookmarks.getProgressListener("DMOZImporter")); + DMOZImporter.setDepth(6); + InstantBusyThread.oneTimeJob(DMOZImporter, 0); + InstantBusyThread.oneTimeJob(DMOZImporter.getConsumer(sb, bmk_user, autoTaggingQueue, autotag, empty, indexing, medialink), 0); + + prop.put("status", "1"); + } catch (Exception e) { + Log.logException(e); + } } - if(post.containsKey("autotag") && !post.get("autotag", "off").equals("off")) { - try { - autoTaggingQueue.put(YMarkAutoTagger.POISON); - Log.logInfo(YMarkTables.BOOKMARKS_LOG, "Importer inserted poison pill in autoTagging queue"); - } catch (final InterruptedException e) { - Log.logException(e); - } - } } else { prop.put(serverObjects.ACTION_AUTHENTICATE, YMarkTables.USER_AUTHENTICATE_MSG); } // return rewrite properties return prop; } - + public static void putBookmark(final Switchboard sb, final String bmk_user, final YMarkEntry bmk, final ArrayBlockingQueue autoTaggingQueue, final boolean autotag, final boolean empty, final String indexing, final boolean medialink) { try { @@ -240,62 +246,22 @@ public class import_ymark { } else if(!bmk.containsKey(YMarkEntry.BOOKMARK.TAGS.key()) || bmk.get(YMarkEntry.BOOKMARK.TAGS.key()).equals(YMarkEntry.BOOKMARK.TAGS.deflt())) { autoTaggingQueue.put(url); } - } - + } // fill crawler if (indexing.equals("single")) { - crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 0, true, medialink); + 
bmk.crawl(YMarkCrawlStart.CRAWLSTART.SINGLE, medialink, sb); } else if (indexing.equals("onelink")) { - crawlStart(sb, new DigestURI(url), CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 1, true, medialink); + bmk.crawl(YMarkCrawlStart.CRAWLSTART.ONE_LINK, medialink, sb); } else if (indexing.equals("fulldomain")) { - final DigestURI u = new DigestURI(url); - crawlStart(sb, u, CrawlProfile.mustMatchFilterFullDomain(u), CrawlProfile.MATCH_NEVER_STRING, 99, false, medialink); + bmk.crawl(YMarkCrawlStart.CRAWLSTART.FULL_DOMAIN, medialink, sb); } } } catch (final IOException e) { Log.logException(e); - } catch (final SpaceExceededException e) { - Log.logException(e); } catch (final InterruptedException e) { Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); } } - - public static String crawlStart( - final Switchboard sb, - final DigestURI startURL, - final String urlMustMatch, - final String urlMustNotMatch, - final int depth, - final boolean crawlingQ, final boolean medialink) { - final CrawlProfile pe = new CrawlProfile( - (startURL.getHost() == null) ? 
startURL.toNormalform(true, false) : startURL.getHost(), null, - urlMustMatch, - urlMustNotMatch, - CrawlProfile.MATCH_ALL_STRING, - CrawlProfile.MATCH_NEVER_STRING, - "", depth, medialink, - CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, crawlingQ, - true, true, true, false, true, true, true, - CacheStrategy.IFFRESH); - sb.crawler.putActive(pe.handle().getBytes(), pe); - return sb.crawlStacker.stackCrawl(new Request( - sb.peers.mySeed().hash.getBytes(), - startURL, - null, - "CRAWLING-ROOT", - new Date(), - pe.handle(), 0, 0, 0, 0 - )); - } - -} - - - - - - - - - +} \ No newline at end of file diff --git a/htroot/api/ymarks/manage_tags.java b/htroot/api/ymarks/manage_tags.java index c01e0dd22..03289a0a4 100644 --- a/htroot/api/ymarks/manage_tags.java +++ b/htroot/api/ymarks/manage_tags.java @@ -56,8 +56,8 @@ public class manage_tags { if(qtype.equals("_tags")) { if(query.isEmpty()) query = tags; - final String[] tagArray = YMarkUtil.cleanTagsString(query).split(YMarkUtil.TAGS_SEPARATOR); - row_iter = sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagArray); + final String tagsString = YMarkUtil.cleanTagsString(query); + row_iter = sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagsString); } else if(qtype.equals("_folder")) { row_iter = sb.tables.bookmarks.getBookmarksByFolder(bmk_user, query); } else { @@ -67,8 +67,8 @@ public class manage_tags { row_iter = sb.tables.iterator(bmk_table, Pattern.compile(query)); } } else { - final String[] tagArray = YMarkUtil.cleanTagsString(tags).split(YMarkUtil.TAGS_SEPARATOR); - row_iter = sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagArray); + final String tagsString = YMarkUtil.cleanTagsString(tags); + row_iter = sb.tables.bookmarks.getBookmarksByTag(bmk_user, tagsString); // row_iter = sb.tables.iterator(bmk_table); } sb.tables.bookmarks.replaceTags(row_iter, bmk_user, tags, replace); diff --git a/htroot/js/yacy-ymarks-bookmark-actions.js 
b/htroot/js/yacy-ymarks-bookmark-actions.js index ed0e12d9d..e14a78f40 100644 --- a/htroot/js/yacy-ymarks-bookmark-actions.js +++ b/htroot/js/yacy-ymarks-bookmark-actions.js @@ -62,7 +62,7 @@ function bm_action(com,grid) { $("#bm_desc").setValue($('.trSelected',grid).find('p.desc').text().trim()); $('#bm_tags').importTags($('.trSelected',grid).find('p.tags').text().trim().replace(/,\s/g,",")); /* $("#bm_tags").setValue($('.trSelected',grid).find('p.tags').text().trim().replace(/,\s/g,",")); */ - $("#bm_path").setValue($('.trSelected',grid).find('p.folders').text().replace(/,\s/g,",")); + $("#bm_path").setValue($('.trSelected',grid).find('p.folders').text().replace(/, \s/g,",")); $("#bm_public").setValue($('.trSelected',grid).find('img').attr('alt')); $("#ymarks_add_dialog").dialog('open'); } else if (com=='Crawl') { diff --git a/htroot/js/yacy-ymarks.js b/htroot/js/yacy-ymarks.js index a1ad65d1e..5a0c3271d 100644 --- a/htroot/js/yacy-ymarks.js +++ b/htroot/js/yacy-ymarks.js @@ -8,7 +8,7 @@ $(document).ready(function() { /* Initialize Bookmark Dialog */ bm_dialog(); - + /* Initialize Flexigrid */ $('#ymarks_flexigrid').flexigrid({ url: '/api/ymarks/get_ymark.json', @@ -103,10 +103,22 @@ $(document).ready(function() { $("input[name='root']").attr("disabled","disabled"); } else if ($("input[name=importer]:checked").val() == 'bmks') { $("input[name='bmkfile']").attr("disabled","disabled"); + } else if ($("input[name=importer]:checked").val() == 'dmoz') { + $("input[name='bmkfile']").attr("disabled","disabled"); + $("input[name='root']").setValue("/DMOZ"); + $("input[name='source']").removeAttr("disabled"); + $("input[name='source']").setValue("Top/"); + alert("The DMOZ RDF dump is exspected on your YaCy peer at DATA/WORK/content.rdf.u8.gz" + + "\nYou can download the file from http://rdf.dmoz.org/rdf/content.rdf.u8.gz (ca. 320 MB)." + + "\n\nPlease check http://www.dmoz.org/license.html before you import any DMOZ data into YaCy!" 
+ + "\n\nDue to the large number of links contained in the dmoz file it is recommended" + + "\nto limit the import volume with an appropriate value for the source folder (e.g. Top/Games).") } else { $("input[name='bmkfile']").removeAttr("disabled"); $("input[name='root']").removeAttr("disabled"); $("input[name='root']").setValue("/Imported Bookmarks"); + $("input[name='source']").attr("disabled","disabled"); + $("input[name='source']").setValue(""); } }); @@ -155,6 +167,38 @@ $(document).ready(function() { minWidth: 200, maxWidth: 200, header: "", + multiple: false, + selectedList: 1 + }); + + $("#ymarks_importer").multiselect({ + noneSelectedText: "Select an Importer ...", + minWidth: 200, + maxWidth: 200, + header: "", + multiple: false, + selectedList: 1 + }); + + $("#ymarks_autotag").multiselect({ + noneSelectedText: "Select an option ...", + minWidth: 200, + maxWidth: 200, + header: "", + multiple: false, + selectedList: 1 + }); + + $("#ymarks_indexing").multiselect({ + position: { + my: 'left bottom', + at: 'left top' + }, + noneSelectedText: "Select an option ...", + minWidth: 200, + maxWidth: 200, + header: "", + multiple: false, selectedList: 1 }); @@ -254,12 +298,12 @@ function loadTagCloud() { }; function loadTreeView() { - $("#ymarks_treeview").empty(); + $("#ymarks_treeview").empty(); $("#ymarks_treeview").treeview({ url: "/api/ymarks/get_treeview.json?bmtype=href", - unique: true, + unique: false, persist: "location" - }); + }); $("#ymarks_treeview").bind("click", function(event) { if ($(event.target).is("li") || $(event.target).parents("li").length) { @@ -270,7 +314,8 @@ function loadTreeView() { newp: 1 }); $('#ymarks_flexigrid').flexReload(); - return false; - } + } + return false; }); + return false; } diff --git a/source/de/anomic/data/ymark/MonitoredReader.java b/source/de/anomic/data/ymark/MonitoredReader.java new file mode 100644 index 000000000..feabd78d0 --- /dev/null +++ b/source/de/anomic/data/ymark/MonitoredReader.java @@ -0,0 +1,102 @@ 
+package de.anomic.data.ymark; + +import java.io.FilterReader; +import java.io.IOException; +import java.io.Reader; +import java.nio.CharBuffer; + +import javax.swing.event.ChangeEvent; +import javax.swing.event.ChangeListener; + +/** + * This class monitors the read progress + * + */ +public class MonitoredReader extends FilterReader { + private volatile long mark = 0; + private volatile long location = 0; + private final int threshold; + private final long maxProgress; + private long lastTriggeredLocation = 0; + private ChangeListener listener = null; + + public MonitoredReader(Reader in, int threshold, long maxProgress) { + super(in); + this.threshold = threshold; + this.maxProgress = maxProgress; + } + + public void addChangeListener(ChangeListener l) { + this.listener = l; + } + + protected void triggerChanged(final long location) { + if ( threshold > 0 && Math.abs( location-lastTriggeredLocation ) < threshold ) + return; + lastTriggeredLocation = location; + if (listener == null) + return; + listener.stateChanged(new ChangeEvent(this)); + } + + public long getProgress() { + return this.location; + } + + public long maxProgress() { + return this.maxProgress; + } + + @Override + public int read() throws IOException { + final int i = super.read(); + if ( i != -1 ) + triggerChanged(location++); + return i; + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + final int i = super.read(cbuf, off, len); + if ( i != -1 ) + triggerChanged(location+=i); + return i; + } + + @Override + public int read(char[] cbuf) throws IOException { + final int i = super.read(cbuf); + if ( i != -1 ) + triggerChanged(location+=i); + return i; + } + + @Override + public int read(CharBuffer target) throws IOException { + final int i = super.read(target); + if ( i != -1 ) + triggerChanged(location+=i); + return i; + } + + @Override + public long skip(long n) throws IOException { + final long i = super.skip(n); + if ( i != -1 ) + 
triggerChanged(location+=i); + return i; + } + + @Override + public synchronized void mark(int readlimit) throws IOException { + super.mark(readlimit); + mark = location; + } + + @Override + public synchronized void reset() throws IOException { + super.reset(); + if ( location != mark ) + triggerChanged(location = mark); + } +} diff --git a/source/de/anomic/data/ymark/YMarkAutoTagger.java b/source/de/anomic/data/ymark/YMarkAutoTagger.java index 387b31b41..a8b635d90 100644 --- a/source/de/anomic/data/ymark/YMarkAutoTagger.java +++ b/source/de/anomic/data/ymark/YMarkAutoTagger.java @@ -59,7 +59,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle this.merge = true; } - private static Document loadDocument(final String url, final LoaderDispatcher loader) { + private static Document loadDocument(final String url, final LoaderDispatcher loader) throws IOException { DigestURI uri; Response response; try { @@ -68,12 +68,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to malformed url: "+url); return null; } - try { - response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay); - } catch (final IOException e) { - Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to IOException for url: "+url); - return null; - } + response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay); try { return Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); } catch (final Failure e) { @@ -215,8 +210,18 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle } public static String autoTag(final String url, final LoaderDispatcher loader, final int max, final TreeMap tags) { - final Document document = loadDocument(url, loader); - 
return (document != null) ? autoTag(document, max, tags) : "/IOExceptions"; + Document document = null; + String exception = "/IOExceptions"; + try { + document = loadDocument(url, loader); + } catch (IOException e) { + exception = e.getMessage(); + int start = exception.indexOf('\'')+9; + int end = exception.indexOf('\'', start); + if(start >= 0 && end > 0 && start < exception.length() && end < exception.length()) + exception = "/IOExceptions/" + exception.substring(start, end); + } + return (document != null) ? autoTag(document, max, tags) : exception; } public static boolean isDigitSpace(String str) { @@ -234,17 +239,15 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle @Override public void run() { - Log.logInfo(YMarkTables.BOOKMARKS_LOG, "autoTagger run()"); Thread.currentThread().setUncaughtExceptionHandler(this); String url = null; String tagString; Iterator tit; try { final TreeMap tags = this.ymarks.getTags(this.bmk_user); - Log.logInfo(YMarkTables.BOOKMARKS_LOG, "autoTagger queue size: "+this.bmkQueue.size()); while((url = this.bmkQueue.take()) != POISON) { tagString = autoTag(url, this.loader, 5, tags); - if (tagString.equals("/IOExceptions")) { + if (tagString.startsWith("/IOExceptions")) { this.ymarks.addFolder(this.bmk_user, url, tagString); tagString = ""; } @@ -262,7 +265,6 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle } } } - Log.logInfo(YMarkTables.BOOKMARKS_LOG, "autoTagger has been poisoned"); } catch (final InterruptedException e) { Log.logException(e); } catch (final IOException e) { diff --git a/source/de/anomic/data/ymark/YMarkCrawlStart.java b/source/de/anomic/data/ymark/YMarkCrawlStart.java index d0474b3a0..73f314ecf 100644 --- a/source/de/anomic/data/ymark/YMarkCrawlStart.java +++ b/source/de/anomic/data/ymark/YMarkCrawlStart.java @@ -1,6 +1,6 @@ // YMarkCrawlStart.java -// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany -// first published 2010 on 
http://yacy.net +// (C) 2012 by Stefan Förster, sof@gmx.de, Norderstedt, Germany +// first published 2011 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // @@ -33,13 +33,17 @@ import java.util.Iterator; import java.util.regex.Pattern; import net.yacy.cora.document.UTF8; +import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.kelondro.blob.Tables; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.search.Switchboard; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlSwitchboard; +import de.anomic.crawler.retrieval.Request; import de.anomic.data.WorkTables; public class YMarkCrawlStart extends HashMap{ - + private static final long serialVersionUID = 1L; private final WorkTables worktables; private Date date_last_exec; @@ -47,7 +51,11 @@ public class YMarkCrawlStart extends HashMap{ private Date date_recording; private String apicall_pk; private String url; - + + public static enum CRAWLSTART { + SINGLE, ONE_LINK, FULL_DOMAIN + } + public YMarkCrawlStart(final WorkTables worktables) { super(); this.date_recording = new Date(0); @@ -62,41 +70,44 @@ public class YMarkCrawlStart extends HashMap{ this.clear(); this.load(); } - + public String getPK() { if(this.isEmpty()) return ""; return this.apicall_pk; } - + public Date date_last_exec() { if(this.isEmpty()) return new Date(0); return this.date_last_exec; } - + public Date date_next_exec() { if(this.isEmpty()) return new Date(0); return this.date_next_exec; } - - public boolean hasSchedule() { - return !this.isEmpty() && this.date_next_exec.after(new Date()); + + public boolean hasSchedule() { + if(!this.isEmpty() && this.date_next_exec.after(new Date())) + return true; + else + return false; } - + public boolean isRunning(final CrawlSwitchboard crawler) { final Iterator iter = crawler.getActive().iterator(); while(iter.hasNext()) { final byte[] key = iter.next(); final CrawlProfile crawl = crawler.getActive(key); if 
(crawl.startURL().equals(this.url)) { - return true; + return true; } } return false; } - + public Date date_recording() { return this.date_recording; } @@ -108,16 +119,16 @@ public class YMarkCrawlStart extends HashMap{ this.load(); } } - + public int exec(final String host, final int port, final String realm) { return this.worktables.execAPICall(this.apicall_pk, host, port, realm); } - + private void load() { try { final StringBuilder buffer = new StringBuilder(500); buffer.append("^crawl start for "); - buffer.append(Pattern.quote(this.url)); + buffer.append(Pattern.quote(url)); buffer.append("?.*"); final Pattern pattern = Pattern.compile(buffer.toString()); //final Iterator APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern); @@ -126,7 +137,7 @@ public class YMarkCrawlStart extends HashMap{ while(APIcalls.hasNext()) { row = APIcalls.next(); if(row.get(WorkTables.TABLE_API_COL_TYPE, "").equals("crawler")) { - Date date = row.get(WorkTables.TABLE_API_COL_DATE_RECORDING, row.get(WorkTables.TABLE_API_COL_DATE, new Date())); + Date date = row.get(WorkTables.TABLE_API_COL_DATE_RECORDING, row.get(WorkTables.TABLE_API_COL_DATE, new Date())); if(date.after(this.date_recording)) { this.clear(); this.apicall_pk = UTF8.String(row.getPK()); @@ -158,4 +169,30 @@ public class YMarkCrawlStart extends HashMap{ // TODO Auto-generated catch block } } + + public static String crawlStart( + final Switchboard sb, + final DigestURI startURL, + final String urlMustMatch, + final String urlMustNotMatch, + final int depth, + final boolean crawlingQ, final boolean medialink) { + final CrawlProfile pe = new CrawlProfile( + (startURL.getHost() == null) ? 
startURL.toNormalform(true, false) : startURL.getHost(), null, + urlMustMatch, + urlMustNotMatch, + CrawlProfile.MATCH_ALL_STRING, + CrawlProfile.MATCH_NEVER_STRING, + "", depth, medialink, + CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, crawlingQ, true, true, true, false, true, true, true,CacheStrategy.IFFRESH); + sb.crawler.putActive(pe.handle().getBytes(), pe); + return sb.crawlStacker.stackCrawl(new Request( + sb.peers.mySeed().hash.getBytes(), + startURL, + null, + "CRAWLING-ROOT", + new Date(), + pe.handle(), 0, 0, 0, 0 + )); + } } diff --git a/source/de/anomic/data/ymark/YMarkDMOZImporter.java b/source/de/anomic/data/ymark/YMarkDMOZImporter.java new file mode 100644 index 000000000..2e0d82dd0 --- /dev/null +++ b/source/de/anomic/data/ymark/YMarkDMOZImporter.java @@ -0,0 +1,152 @@ +// YMarkDMOZImporter.java +// (C) 2012 by Stefan Foerster (apfelmaennchen), sof@gmx.de, Norderstedt, Germany +// first published 2012 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.data.ymark; + +import net.yacy.cora.lod.vocabulary.DMOZ; +import net.yacy.cora.lod.vocabulary.DublinCore; + +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; +import org.xml.sax.helpers.DefaultHandler; +import org.xml.sax.helpers.XMLReaderFactory; + +public class YMarkDMOZImporter extends YMarkImporter { + // Statics + public static String IMPORTER = "DMOZ"; + + // Importer Variables + private final XMLReader xmlReader; + private int depth; + + public YMarkDMOZImporter(final MonitoredReader dmoz_file, final int queueSize, final String targetFolder, final String sourceFolder) throws SAXException { + super(dmoz_file, queueSize, sourceFolder, targetFolder); + setImporter(IMPORTER); + this.xmlReader = XMLReaderFactory.createXMLReader(); + this.xmlReader.setFeature(XML_NAMESPACE_PREFIXES, false); + this.xmlReader.setFeature(XML_NAMESPACES, false); + this.xmlReader.setFeature(XML_VALIDATION, false); + this.xmlReader.setContentHandler(new DMOZParser()); + this.depth = Integer.MAX_VALUE; + } + + public void parse() throws Exception { + xmlReader.parse(new InputSource(bmk_file)); + } + + public void setDepth(int d) { + this.depth = d + this.targetFolder.split(YMarkUtil.FOLDERS_SEPARATOR).length-1; + } + + public class DMOZParser extends DefaultHandler { + + private YMarkEntry bmk; + private boolean isNewEntry; + private boolean isSubtopic; + private String tag; + private final StringBuilder buffer; + + public DMOZParser() { + this.bmk = new YMarkEntry(); + this.isNewEntry = false; + this.isSubtopic = false; + this.buffer = new StringBuilder(512); + } + + public void startElement(final String uri, String localName, final String qName, final Attributes attributes) 
throws SAXException { + // get rid of namespace prefixes + if (localName.isEmpty()) { + localName = qName.substring(qName.indexOf(':')+1); + } + this.tag = null; + if (localName.equals(DMOZ.ExternalPage.name())) { + this.bmk = new YMarkEntry(); + this.bmk.put(YMarkEntry.BOOKMARK.URL.key(), attributes.getValue(0)); + this.isNewEntry = true; + } + if(isNewEntry && localName.equals(DublinCore.Title.name())) { + this.tag = YMarkEntry.BOOKMARK.TITLE.key(); + } + if(isNewEntry && localName.equals(DublinCore.Description.name())) { + this.tag = YMarkEntry.BOOKMARK.DESC.key(); + } + if(isNewEntry && localName.equals(DMOZ.topic.name())) { + this.tag = YMarkEntry.BOOKMARK.FOLDERS.key(); + buffer.append(targetFolder); + buffer.append(YMarkUtil.FOLDERS_SEPARATOR); + } + } + + public void endElement(final String uri, String localName, final String qName) throws SAXException { + // get rid of namespace prefixes + if (localName.isEmpty()) { + localName = qName.substring(qName.indexOf(':')+1); + } + if (this.isNewEntry && this.isSubtopic && localName.equals(DMOZ.ExternalPage.name())) { + try { + bookmarks.put(this.bmk); + } catch (InterruptedException e) { + e.printStackTrace(); + } finally { + this.isSubtopic = false; + this.isNewEntry = false; + } + } else if(localName.equals(DMOZ.topic.name())) { + int d = 0; + for(int i=0; i depth) { + this.buffer.setLength(i); + break; + } + } + } + if (this.buffer.substring(targetFolder.length()+1).startsWith(sourceFolder)) { + this.isSubtopic = true; + this.bmk.put(this.tag, YMarkUtil.cleanFoldersString(buffer)); + } else { + this.isSubtopic = false; + this.isNewEntry = false; + } + } else if (this.tag != null) { + this.bmk.put(this.tag, buffer.toString()); + } + this.tag = null; + this.buffer.setLength(0); + } + + public void characters(final char ch[], final int start, final int length) throws SAXException { + // no processing here, as the SAX Parser characters method could be called more than once per tag! 
+ if(this.tag != null) { + buffer.append(ch, start, length); + } + } + } +} + diff --git a/source/de/anomic/data/ymark/YMarkEntry.java b/source/de/anomic/data/ymark/YMarkEntry.java index bd73bc800..f44f25032 100644 --- a/source/de/anomic/data/ymark/YMarkEntry.java +++ b/source/de/anomic/data/ymark/YMarkEntry.java @@ -1,5 +1,33 @@ +// YMarkEntry.java +// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany +// first published 2011 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + package de.anomic.data.ymark; +import java.net.MalformedURLException; +import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; import java.util.Map; @@ -7,6 +35,10 @@ import java.util.TreeMap; import net.yacy.document.content.DCEntry; import net.yacy.kelondro.blob.Tables; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; +import net.yacy.search.Switchboard; +import de.anomic.crawler.CrawlProfile; public class YMarkEntry extends TreeMap { @@ -16,19 +48,19 @@ public class YMarkEntry extends TreeMap { public static final String BOOKMARKS_ID = "id"; public static final String BOOKMARKS_REF = "ref"; public static final String FOLDERS_IMPORTED = "/imported"; - + public static enum BOOKMARK { - // key dc_attrb dflt html_attrb xbel_attrb json_attrb type - URL ("url", "dc:identifier", "", "href", "href", "uri", "link"), - TITLE ("title", "dc:title", "", "", "", "title", "meta"), - DESC ("desc", "dc:description", "", "", "", "", "comment"), - DATE_ADDED ("date_added", "", "", "add_date", "added", "dateAdded", "date"), - DATE_MODIFIED ("date_modified", "", "", "last_modified", "modified", "lastModified", "date"), - DATE_VISITED ("date_visited", "", "", "last_visited", "visited", "", "date"), - PUBLIC ("public", "", "false", "private", "yacy:public", "", "lock"), - TAGS ("tags", "dc:subject", "unsorted", "shortcuturl", "yacy:tags", "keyword", "tag"), - VISITS ("visits", "", "0", "", "yacy:visits", "", "stat"), - FOLDERS ("folders", "", "/unsorted", "", "", "", "folder"); + // key dc_attrb dflt html_attrb xbel_attrb json_attrb type index separator + URL ("url", "dc:identifier", "", "href", "href", "uri", "link", false, YMarkUtil.EMPTY_STRING), + TITLE ("title", "dc:title", "", "", "", "title", "meta", 
false, YMarkUtil.EMPTY_STRING), + DESC ("desc", "dc:description", "", "", "", "", "comment", false, YMarkUtil.EMPTY_STRING), + DATE_ADDED ("date_added", "", "", "add_date", "added", "dateAdded", "date", false, YMarkUtil.EMPTY_STRING), + DATE_MODIFIED ("date_modified", "", "", "last_modified", "modified", "lastModified", "date", false, YMarkUtil.EMPTY_STRING), + DATE_VISITED ("date_visited", "", "", "last_visited", "visited", "", "date", false, YMarkUtil.EMPTY_STRING), + PUBLIC ("public", "", "false", "private", "yacy:public", "", "lock", false, YMarkUtil.EMPTY_STRING), + TAGS ("tags", "dc:subject", "unsorted", "shortcuturl", "yacy:tags", "keyword", "tag", true, YMarkUtil.TAGS_SEPARATOR), + VISITS ("visits", "", "0", "", "yacy:visits", "", "stat", false, YMarkUtil.EMPTY_STRING), + FOLDERS ("folders", "", "/unsorted", "", "", "", "folder", true, YMarkUtil.TAGS_SEPARATOR); private String key; private String dc_attrb; @@ -37,16 +69,23 @@ public class YMarkEntry extends TreeMap { private String xbel_attrb; private String json_attrb; private String type; + private boolean index; + private String seperator; private static final Map lookup = new HashMap(); + private static final Map indexColumns = new HashMap(); static { - for(BOOKMARK b : EnumSet.allOf(BOOKMARK.class)) - lookup.put(b.key(), b); + for(BOOKMARK b : EnumSet.allOf(BOOKMARK.class)) { + lookup.put(b.key, b); + if(b.index) { + indexColumns.put(b.key, b.seperator); + } + } } private static StringBuilder buffer = new StringBuilder(25); - private BOOKMARK(final String k, final String d, final String s, final String a, final String x, final String j, final String t) { + private BOOKMARK(final String k, final String d, final String s, final String a, final String x, final String j, final String t, final boolean index, final String separator) { this.key = k; this.dc_attrb = d; this.dflt = s; @@ -54,6 +93,11 @@ public class YMarkEntry extends TreeMap { this.xbel_attrb = x; this.json_attrb = j; this.type = t; + 
this.index = index; + this.seperator = separator; + } + public static Map indexColumns() { + return Collections.unmodifiableMap(indexColumns); } public static BOOKMARK get(String key) { return lookup.get(key); @@ -92,14 +136,21 @@ public class YMarkEntry extends TreeMap { public String type() { return this.type; } + public boolean index() { + return this.index; + } + public String seperator() { + return this.seperator; + } + } - + public YMarkEntry() { this(true); } public YMarkEntry(final boolean setDefaults) { - super(); + super(); if(setDefaults) { setCurrentTimeMillis(BOOKMARK.DATE_ADDED); setCurrentTimeMillis(BOOKMARK.DATE_MODIFIED); @@ -108,7 +159,8 @@ public class YMarkEntry extends TreeMap { } public YMarkEntry(final DCEntry dc) { - for (BOOKMARK b : BOOKMARK.values()) { + super(); + for (BOOKMARK b : BOOKMARK.values()) { if(dc.containsKey(b.dc_attrb)) { this.put(b.key(), dc.get(b.dc_attrb)); } @@ -119,7 +171,8 @@ public class YMarkEntry extends TreeMap { } public YMarkEntry(final Tables.Row bmk_row) { - for (BOOKMARK b : BOOKMARK.values()) { + super(); + for (BOOKMARK b : BOOKMARK.values()) { if(bmk_row.containsKey(b.key())) { this.put(b.key(), bmk_row.get(b.key(), b.deflt())); } @@ -146,6 +199,16 @@ public class YMarkEntry extends TreeMap { } } + public byte[] getUrlHash() { + if(this.containsKey(YMarkEntry.BOOKMARK.URL.key())) + try { + return YMarkUtil.getBookmarkId(this.get(YMarkEntry.BOOKMARK.URL.key())); + } catch (MalformedURLException e) { + Log.logWarning(YMarkTables.BOOKMARKS_LOG, "getUrlHash - MalformedURLException for YMarkEntry: "+this.get(YMarkEntry.BOOKMARK.URL.key())); + } + return null; + } + public DCEntry getDCEntry() { final DCEntry dc = new DCEntry(); for (BOOKMARK b : BOOKMARK.values()) { @@ -167,4 +230,21 @@ public class YMarkEntry extends TreeMap { } return data; } + + public void crawl(final YMarkCrawlStart.CRAWLSTART type, final boolean medialink, final Switchboard sb) throws MalformedURLException { + final DigestURI url = new 
DigestURI(this.get(BOOKMARK.URL.key())); + switch(type) { + case SINGLE: + YMarkCrawlStart.crawlStart(sb, url, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 0, true, medialink); + break; + case ONE_LINK: + YMarkCrawlStart.crawlStart(sb, url, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, 1, true, medialink); + break; + case FULL_DOMAIN: + YMarkCrawlStart.crawlStart(sb, url, CrawlProfile.mustMatchFilterFullDomain(url), CrawlProfile.MATCH_NEVER_STRING, 99, false, medialink); + break; + default: + break; + } + } } diff --git a/source/de/anomic/data/ymark/YMarkHTMLImporter.java b/source/de/anomic/data/ymark/YMarkHTMLImporter.java index 47a3ba1a9..f0d5a37b6 100644 --- a/source/de/anomic/data/ymark/YMarkHTMLImporter.java +++ b/source/de/anomic/data/ymark/YMarkHTMLImporter.java @@ -26,10 +26,6 @@ package de.anomic.data.ymark; -import java.io.IOException; -import java.io.Reader; -import java.util.concurrent.ArrayBlockingQueue; - import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLEditorKit; @@ -37,17 +33,13 @@ import javax.swing.text.html.parser.ParserDelegator; import net.yacy.kelondro.logging.Log; -public class YMarkHTMLImporter extends HTMLEditorKit.ParserCallback implements Runnable { - +public class YMarkHTMLImporter extends YMarkImporter { + // Importer Variables - private final ArrayBlockingQueue bookmarks; - private final Reader bmk_file; - private final String RootFolder; - private final StringBuilder folderstring; - private YMarkEntry bmk; private final ParserDelegator htmlParser; // Statics + public static String IMPORTER = "HTML"; public static enum STATE { NOTHING, BOOKMARK, @@ -56,153 +48,137 @@ public class YMarkHTMLImporter extends HTMLEditorKit.ParserCallback implements R FOLDER_DESC } public static final String MILLIS = "000"; - - // Parser variables - private STATE state; - private HTML.Tag prevTag; - - public YMarkHTMLImporter(final Reader bmk_file, final int 
queueSize, final String root) { - this.bookmarks = new ArrayBlockingQueue(queueSize); - this.bmk_file = bmk_file; - this.RootFolder = root; - this.folderstring = new StringBuilder(YMarkTables.BUFFER_LENGTH); - this.folderstring.append(this.RootFolder); - this.bmk = new YMarkEntry(); - - this.htmlParser = new ParserDelegator(); - - this.state = STATE.NOTHING; - this.prevTag = null; - } - public void run() { - try { - this.htmlParser.parse(this.bmk_file, this, true); - } catch (IOException e) { - Log.logException(e); - } finally { - try { - this.bookmarks.put(YMarkEntry.POISON); - } catch (InterruptedException e) { - Log.logException(e); - } - try { - this.bmk_file.close(); - } catch (IOException e) { - Log.logException(e); - } - } + public YMarkHTMLImporter(final MonitoredReader bmk_file, final int queueSize, final String targetFolder, final String sourceFolder) { + super(bmk_file, queueSize, targetFolder, sourceFolder); + setImporter(IMPORTER); + this.htmlParser = new ParserDelegator(); } - public void handleText(char[] data, int pos) { - switch (state) { - case NOTHING: - break; - case BOOKMARK: - this.bmk.put(YMarkEntry.BOOKMARK.TITLE.key(), new String(data)); - this.bmk.put(YMarkEntry.BOOKMARK.FOLDERS.key(), this.folderstring.toString()); - this.bmk.put(YMarkEntry.BOOKMARK.PUBLIC.key(), YMarkEntry.BOOKMARK.PUBLIC.deflt()); - this.bmk.put(YMarkEntry.BOOKMARK.VISITS.key(), YMarkEntry.BOOKMARK.VISITS.deflt()); - break; - case FOLDER: - this.folderstring.append(YMarkUtil.FOLDERS_SEPARATOR); - this.folderstring.append(data); - break; - case FOLDER_DESC: - Log.logInfo(YMarkTables.BOOKMARKS_LOG, "YMarksHTMLImporter - folder: "+this.folderstring+" desc: " + new String(data)); - break; - case BMK_DESC: - this.bmk.put(YMarkEntry.BOOKMARK.DESC.key(), new String(data)); - break; - default: - break; - } + public YMarkHTMLImporter (final MonitoredReader bmk_file, final int queueSize, final String targetFolder) { + this(bmk_file, queueSize, targetFolder, ""); + } + + public 
void parse() throws Exception { + htmlParser.parse(bmk_file, new HTMLParser(), true); } + + public class HTMLParser extends HTMLEditorKit.ParserCallback { + + private YMarkEntry bmk; + private final StringBuilder folderstring; + private STATE state; + private HTML.Tag prevTag; + + public HTMLParser() { + this.folderstring = new StringBuilder(YMarkTables.BUFFER_LENGTH); + this.folderstring.append(targetFolder); + this.bmk = new YMarkEntry(); + this.state = STATE.NOTHING; + this.prevTag = null; + } + + public void handleText(char[] data, int pos) { + switch (state) { + case NOTHING: + break; + case BOOKMARK: + this.bmk.put(YMarkEntry.BOOKMARK.TITLE.key(), new String(data)); + this.bmk.put(YMarkEntry.BOOKMARK.FOLDERS.key(), this.folderstring.toString()); + this.bmk.put(YMarkEntry.BOOKMARK.PUBLIC.key(), YMarkEntry.BOOKMARK.PUBLIC.deflt()); + this.bmk.put(YMarkEntry.BOOKMARK.VISITS.key(), YMarkEntry.BOOKMARK.VISITS.deflt()); + break; + case FOLDER: + this.folderstring.append(YMarkUtil.FOLDERS_SEPARATOR); + this.folderstring.append(data); + break; + case FOLDER_DESC: + Log.logInfo(YMarkTables.BOOKMARKS_LOG, "YMarksHTMLImporter - folder: "+this.folderstring+" desc: " + new String(data)); + break; + case BMK_DESC: + this.bmk.put(YMarkEntry.BOOKMARK.DESC.key(), new String(data)); + break; + default: + break; + } + } - public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { - if (t == HTML.Tag.A) { - if (!this.bmk.isEmpty()) { - try { - this.bookmarks.put(this.bmk); - bmk = new YMarkEntry(); - } catch (InterruptedException e) { - Log.logException(e); - } - } - final String url = (String)a.getAttribute(HTML.Attribute.HREF); - this.bmk.put(YMarkEntry.BOOKMARK.URL.key(), url); - final StringBuilder sb = new StringBuilder(255); - for (YMarkEntry.BOOKMARK bmk : YMarkEntry.BOOKMARK.values()) { - sb.setLength(0); - if (a.isDefined(bmk.html_attrb())) { - sb.append((String)a.getAttribute(bmk.html_attrb())); - Log.logInfo(YMarkTables.BOOKMARKS_LOG, bmk.key()+" : 
"+sb.toString()); + public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { + if (t == HTML.Tag.A) { + if (!this.bmk.isEmpty()) { + try { + bookmarks.put(this.bmk); + bmk = new YMarkEntry(); + } catch (InterruptedException e) { + Log.logException(e); + } } - switch(bmk) { - case TAGS: - // sb already contains the mozilla shortcuturl - // add delicious.com tags that are stored in the tags attribute - if (a.isDefined(YMarkEntry.BOOKMARK.TAGS.key())) { - sb.append(YMarkUtil.TAGS_SEPARATOR); - sb.append((String)a.getAttribute(YMarkEntry.BOOKMARK.TAGS.key())); - } - this.bmk.put(bmk.key(), YMarkUtil.cleanTagsString(sb.toString())); - break; - case PUBLIC: - // look for delicious.com private attribute - if(sb.toString().equals("0")) - this.bmk.put(bmk.key(), "true"); - break; - case DATE_ADDED: - case DATE_MODIFIED: - case DATE_VISITED: - sb.append(MILLIS); - this.bmk.put(bmk.key(), sb.toString()); - break; - default: - break; - } - } - state = STATE.BOOKMARK; - } else if (t == HTML.Tag.H3) { - state = STATE.FOLDER; - } else if (t == HTML.Tag.DD && this.prevTag == HTML.Tag.A) { - state = STATE.BMK_DESC; - } else { - state = STATE.NOTHING; - } - this.prevTag = t; - } + final String url = (String)a.getAttribute(HTML.Attribute.HREF); + this.bmk.put(YMarkEntry.BOOKMARK.URL.key(), url); + final StringBuilder sb = new StringBuilder(255); + for (YMarkEntry.BOOKMARK bmk : YMarkEntry.BOOKMARK.values()) { + sb.setLength(0); + if (a.isDefined(bmk.html_attrb())) { + sb.append((String)a.getAttribute(bmk.html_attrb())); + Log.logInfo(YMarkTables.BOOKMARKS_LOG, bmk.key()+" : "+sb.toString()); + } + switch(bmk) { + case TAGS: + // sb already contains the mozilla shortcuturl + // add delicious.com tags that are stored in the tags attribute + if (a.isDefined(YMarkEntry.BOOKMARK.TAGS.key())) { + sb.append(YMarkUtil.TAGS_SEPARATOR); + sb.append((String)a.getAttribute(YMarkEntry.BOOKMARK.TAGS.key())); + } + this.bmk.put(bmk.key(), YMarkUtil.cleanTagsString(sb.toString())); + 
break; + case PUBLIC: + // look for delicious.com private attribute + if(sb.toString().equals("0")) + this.bmk.put(bmk.key(), "true"); + break; + case DATE_ADDED: + case DATE_MODIFIED: + case DATE_VISITED: + sb.append(MILLIS); + this.bmk.put(bmk.key(), sb.toString()); + break; + default: + break; + } + } + state = STATE.BOOKMARK; + } else if (t == HTML.Tag.H3) { + state = STATE.FOLDER; + } else if (t == HTML.Tag.DD && this.prevTag == HTML.Tag.A) { + state = STATE.BMK_DESC; + } else { + state = STATE.NOTHING; + } + this.prevTag = t; + } - public void handleEndTag(HTML.Tag t, int pos) { - // write the last bookmark, as no more tags are following - if (t == HTML.Tag.HTML) { - if (!this.bmk.isEmpty()) { - try { - this.bookmarks.put(this.bmk); - } catch (InterruptedException e) { - Log.logException(e); + public void handleEndTag(HTML.Tag t, int pos) { + // write the last bookmark, as no more tags are following + if (t == HTML.Tag.HTML) { + if (!this.bmk.isEmpty()) { + try { + bookmarks.put(this.bmk); + } catch (InterruptedException e) { + Log.logException(e); + } } } + if (t == HTML.Tag.H3) { + state = STATE.FOLDER_DESC; + } else if (t == HTML.Tag.DL) { + //TODO: get rid of .toString.equals() + if(!this.folderstring.toString().equals(targetFolder)) { + folderstring.setLength(folderstring.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR)); + } + } else { + state = STATE.NOTHING; + } } - if (t == HTML.Tag.H3) { - state = STATE.FOLDER_DESC; - } else if (t == HTML.Tag.DL) { - //TODO: get rid of .toString.equals() - if(!this.folderstring.toString().equals(this.RootFolder)) { - folderstring.setLength(folderstring.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR)); - } - } else { - state = STATE.NOTHING; - } } - - public YMarkEntry take() { - try { - return this.bookmarks.take(); - } catch (InterruptedException e) { - Log.logException(e); - return null; - } - } } diff --git a/source/de/anomic/data/ymark/YMarkImporter.java b/source/de/anomic/data/ymark/YMarkImporter.java new file mode 100644 
index 000000000..3f5933b5e --- /dev/null +++ b/source/de/anomic/data/ymark/YMarkImporter.java @@ -0,0 +1,157 @@ +// YMarkImporter.java +// (C) 2012 by Stefan Foerster (apfelmaennchen), sof@gmx.de, Norderstedt, Germany +// first published 2012 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.data.ymark; + +import java.io.IOException; +import java.util.concurrent.ArrayBlockingQueue; + +import net.yacy.cora.util.SpaceExceededException; +import net.yacy.kelondro.logging.Log; +import net.yacy.search.Switchboard; + +public abstract class YMarkImporter implements Runnable { + // Statics + public final static String XML_NAMESPACE_PREFIXES = "http://xml.org/sax/features/namespace-prefixes"; + public final static String XML_NAMESPACES = "http://xml.org/sax/features/namespaces"; + public final static String XML_VALIDATION = "http://xml.org/sax/features/validation"; + + protected String importer; + protected ArrayBlockingQueue bookmarks; + protected final MonitoredReader bmk_file; + protected final String targetFolder; + protected final String sourceFolder; + + public YMarkImporter(final MonitoredReader bmk_file, final int queueSize, final String sourceFolder, final String targetFolder) 
{ + this.bookmarks = new ArrayBlockingQueue(queueSize); + this.bmk_file = bmk_file; + this.sourceFolder = YMarkUtil.cleanFoldersString(sourceFolder); + this.targetFolder = YMarkUtil.cleanFoldersString(targetFolder); + } + + public void run() { + try { + parse(); + } catch (Exception e) { + Log.logException(e); + } finally { + try { + Log.logInfo(YMarkTables.BOOKMARKS_LOG, this.importer+" Importer inserted poison pill in queue"); + this.bookmarks.put(YMarkEntry.POISON); + } catch (InterruptedException e1) { + Log.logException(e1); + } + } + } + + public YMarkEntry take() { + try { + return this.bookmarks.take(); + } catch (InterruptedException e) { + Log.logException(e); + return null; + } + } + + public void setImporter(final String importer) { + this.importer = importer; + } + + public long getProgress() { + return this.bmk_file.getProgress(); + } + + public long maxProgress() { + return this.bmk_file.maxProgress(); + } + + public abstract void parse() throws Exception; + + public Consumer getConsumer(final Switchboard sb, final String bmk_user, final ArrayBlockingQueue autoTaggingQueue, + final boolean autotag, final boolean empty, final String indexing, final boolean medialink) { + return new Consumer(sb, bmk_user, autoTaggingQueue, autotag, empty, indexing, medialink); + } + + public class Consumer implements Runnable { + private final Switchboard sb; + private final String bmk_user; + private final ArrayBlockingQueue autoTaggingQueue; + private final String indexing; + + private final boolean autotag; + private final boolean empty; + private final boolean medialink; + + public Consumer(final Switchboard sb, final String bmk_user, final ArrayBlockingQueue autoTaggingQueue, + final boolean autotag, final boolean empty, final String indexing, final boolean medialink) { + this.sb = sb; + this.bmk_user = bmk_user; + this.autoTaggingQueue = autoTaggingQueue; + this.autotag = autotag; + this.empty = empty; + this.indexing = indexing; + this.medialink = medialink; + } 
+ + public void run() { + YMarkEntry bmk; + while ((bmk = take()) != YMarkEntry.POISON) { + try { + final String url = bmk.get(YMarkEntry.BOOKMARK.URL.key()); + // other protocols could cause problems + if(url != null && url.startsWith("http")) { + sb.tables.bookmarks.addBookmark(bmk_user, bmk, true, true); + if(autotag) { + if(!empty) { + autoTaggingQueue.put(url); + } else if(!bmk.containsKey(YMarkEntry.BOOKMARK.TAGS.key()) || bmk.get(YMarkEntry.BOOKMARK.TAGS.key()).equals(YMarkEntry.BOOKMARK.TAGS.deflt())) { + autoTaggingQueue.put(url); + } + } + // fill crawler + if (indexing.equals("single")) { + bmk.crawl(YMarkCrawlStart.CRAWLSTART.SINGLE, medialink, sb); + } else if (indexing.equals("onelink")) { + bmk.crawl(YMarkCrawlStart.CRAWLSTART.ONE_LINK, medialink, sb); + } else if (indexing.equals("fulldomain")) { + bmk.crawl(YMarkCrawlStart.CRAWLSTART.FULL_DOMAIN, medialink, sb); + } + } + } catch (final IOException e) { + Log.logException(e); + } catch (final InterruptedException e) { + Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); + } + } + if(autotag) { + try { + autoTaggingQueue.put(YMarkAutoTagger.POISON); + Log.logInfo(YMarkTables.BOOKMARKS_LOG, importer+" inserted poison pill into autoTagging queue"); + } catch (final InterruptedException e) { + Log.logException(e); + } + } + } + } +} diff --git a/source/de/anomic/data/ymark/YMarkRDF.java b/source/de/anomic/data/ymark/YMarkRDF.java new file mode 100644 index 000000000..f3bdadb65 --- /dev/null +++ b/source/de/anomic/data/ymark/YMarkRDF.java @@ -0,0 +1,114 @@ +package de.anomic.data.ymark; + +import java.io.ByteArrayOutputStream; +import java.io.UnsupportedEncodingException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import net.yacy.cora.document.UTF8; +import net.yacy.cora.lod.vocabulary.AnnoteaA; +import net.yacy.cora.lod.vocabulary.AnnoteaB; +import net.yacy.cora.lod.vocabulary.DCElements; +import net.yacy.cora.lod.vocabulary.Rdf; 
+import net.yacy.kelondro.blob.Tables; + +import com.hp.hpl.jena.rdf.model.Model; +import com.hp.hpl.jena.rdf.model.ModelFactory; +import com.hp.hpl.jena.rdf.model.Property; +import com.hp.hpl.jena.rdf.model.Resource; + +public class YMarkRDF { + + public final Model model; + + public final static String USER = "USER"; + public final static String TYPE = "TYPE"; + private final Map property; + + public final static String BOOKMARK = "/Ymarks.rdf?id="; + private final StringBuilder resourceURI; + private final int len; + + public YMarkRDF(final String peerURI) { + this.model = ModelFactory.createDefaultModel(); + this.property = new HashMap(); + + this.len = peerURI.length()+BOOKMARK.length(); + this.resourceURI = new StringBuilder(len+20); + this.resourceURI.append(peerURI); + this.resourceURI.append(BOOKMARK); + + model.setNsPrefix(Rdf.PREFIX, Rdf.IDENTIFIER); + model.setNsPrefix(DCElements.PREFIX, DCElements.IDENTIFIER); + model.setNsPrefix(AnnoteaA.PREFIX, AnnoteaA.NAMESPACE); + model.setNsPrefix(AnnoteaB.PREFIX, AnnoteaB.NAMESPACE); + + this.property.put(YMarkEntry.BOOKMARK.URL.key(), this.model.createProperty(AnnoteaB.recalls.getNamespace(), AnnoteaB.recalls.name())); + this.property.put(YMarkEntry.BOOKMARK.FOLDERS.key(), this.model.createProperty(AnnoteaB.hasTopic.getNamespace(), AnnoteaB.hasTopic.name())); + this.property.put(YMarkEntry.BOOKMARK.TITLE.key(), this.model.createProperty(DCElements.title.getNamespace(), DCElements.title.name())); + this.property.put(YMarkEntry.BOOKMARK.DESC.key(), this.model.createProperty(DCElements.description.getNamespace(), DCElements.description.name())); + this.property.put(YMarkEntry.BOOKMARK.DATE_ADDED.key(), this.model.createProperty(AnnoteaA.created.getNamespace(), AnnoteaA.created.name())); + this.property.put(YMarkEntry.BOOKMARK.DATE_MODIFIED.key(), this.model.createProperty(DCElements.date.getNamespace(), DCElements.date.name())); + this.property.put(YMarkEntry.BOOKMARK.TAGS.key(), 
this.model.createProperty(DCElements.subject.getNamespace(), DCElements.subject.name())); + + this.property.put(USER, this.model.createProperty(DCElements.creator.getNamespace(), DCElements.creator.name())); + this.property.put(TYPE, this.model.createProperty(Rdf.type.getNamespace(), Rdf.type.name())); + } + + /** + * @param format {RDF/XML, RDF/XML-ABBREV, N-TRIPLE, N3, N3-PP, N3-PLAIN, N3-TRIPLE, TURTLE} + * @return RDF + */ + public String getRDF(final String format) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + this.model.write(baos, format); + try { + return baos.toString("UTF-8"); + } catch (UnsupportedEncodingException e) { + return new String(); + } + } + + public void addBookmark (final String bmk_user, final Tables.Row bmk_row) { + if(bmk_row == null || bmk_row.get(YMarkEntry.BOOKMARK.PUBLIC.key(), YMarkEntry.BOOKMARK.PUBLIC.deflt()).equals("false")) + return; + final Resource bmk; + // create an annotea bookmark resource + this.resourceURI.append(bmk_user); + this.resourceURI.append(':'); + this.resourceURI.append(UTF8.String(bmk_row.getPK())); + bmk = this.model.createResource(this.resourceURI.toString()); + this.resourceURI.setLength(this.len); + + // add properties + bmk.addProperty(this.property.get(TYPE), AnnoteaB.Bookmark.getPredicate()); + bmk.addProperty(this.property.get(USER), bmk_user); + for (final YMarkEntry.BOOKMARK b : YMarkEntry.BOOKMARK.values()) { + switch(b) { + case FOLDERS: + final String[] folders = bmk_row.get(b.key(), b.deflt()).split(YMarkUtil.TAGS_SEPARATOR); + for(String folder : folders) { + bmk.addProperty(this.property.get(b.key()), folder); + // TODO add Topics to RDF + } + break; + case DATE_ADDED: + case DATE_MODIFIED: + final YMarkDate date = new YMarkDate(bmk_row.get(b.key())); + bmk.addProperty(this.property.get(b.key()), date.toISO8601()); + break; + default: + if(this.property.containsKey(b.key())) { + bmk.addProperty(this.property.get(b.key()), bmk_row.get(b.key(), b.deflt())); + } + } + } + } + + 
public void addBookmarks(final String bmk_user, final Iterator riter) { + while(riter.hasNext()) { + this.addBookmark(bmk_user, riter.next()); + } + } +} diff --git a/source/de/anomic/data/ymark/YMarkTables.java b/source/de/anomic/data/ymark/YMarkTables.java index da5470967..704995310 100644 --- a/source/de/anomic/data/ymark/YMarkTables.java +++ b/source/de/anomic/data/ymark/YMarkTables.java @@ -33,9 +33,13 @@ import java.util.EnumMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Map; import java.util.TreeMap; import java.util.TreeSet; -import java.util.regex.Pattern; +import java.util.concurrent.ConcurrentHashMap; + +import javax.swing.event.ChangeEvent; +import javax.swing.event.ChangeListener; import net.yacy.cora.document.ASCII; import net.yacy.cora.util.SpaceExceededException; @@ -43,7 +47,9 @@ import net.yacy.document.Document; import net.yacy.document.Parser.Failure; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.blob.Tables.Row; +import net.yacy.kelondro.blob.TablesColumnIndex; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; import net.yacy.repository.LoaderDispatcher; import de.anomic.data.WorkTables; @@ -89,19 +95,61 @@ public class YMarkTables { public final static String USER_ADMIN = "admin"; public final static String USER_AUTHENTICATE_MSG = "Bookmark user authentication required!"; - public final static String p1 = "(?:^|.*,)"; - public final static String p4 = "(?:,.*|$)"; - public final static String p5 = "((?:"; - public final static String p6 = ")(?:,.*|$)){"; - public final static String p7 = "/.*)"; - public final static String p8 = "(?:,|$)"; - public final static int BUFFER_LENGTH = 256; private final WorkTables worktables; + private final Map progressListeners; public YMarkTables(final Tables wt) { this.worktables = (WorkTables)wt; + this.progressListeners = new ConcurrentHashMap(); + this.buildIndex(); + } + + public ChangeListener 
getProgressListener(String thread) { + final ChangeListener l = new ProgressListener(); + this.progressListeners.put(thread, l); + return l; + } + + public void removeProgressListener(String thread) { + this.progressListeners.remove(thread); + } + + public class ProgressListener implements ChangeListener { + // the progress in % + private int progress = 0; + public void stateChanged(ChangeEvent e) { + final MonitoredReader mreader = (MonitoredReader)e.getSource(); + this.progress = (int)((mreader.getProgress() / mreader.maxProgress())*100); + } + public int progress() { + return this.progress; + } + } + + public void buildIndex() { + final Iterator iter = this.worktables.iterator(); + while(iter.hasNext()) { + final String bmk_table = iter.next(); + if(bmk_table.endsWith(TABLES.BOOKMARKS.basename())) { + try { + final long time = System.currentTimeMillis(); + final TablesColumnIndex index = this.worktables.getIndex(bmk_table); + if(index.getType() == TablesColumnIndex.INDEXTYPE.RAM || index.size() == 0) { + Log.logInfo(YMarkTables.BOOKMARKS_LOG, "buildIndex() "+YMarkEntry.BOOKMARK.indexColumns().keySet().toString()); + index.buildIndex(YMarkEntry.BOOKMARK.indexColumns(), this.worktables.iterator(bmk_table)); + Log.logInfo(YMarkTables.BOOKMARKS_LOG, "build "+index.getType().name()+" index for columns "+YMarkEntry.BOOKMARK.indexColumns().keySet().toString() + +" of table "+bmk_table+" containing "+this.worktables.size(bmk_table)+ " bookmarks" + +" ("+(System.currentTimeMillis()-time)+"ms)"); + } + } catch (IOException e) { + Log.logException(e); + } catch (Exception e) { + Log.logException(e); + } + } + } } public void deleteBookmark(final String bmk_user, final byte[] urlHash) throws IOException, SpaceExceededException { @@ -111,10 +159,18 @@ public class YMarkTables { if(bmk_row != null) { this.worktables.delete(bmk_table,urlHash); } + if(this.worktables.hasIndex(bmk_table, YMarkEntry.BOOKMARK.FOLDERS.key())) { + try { + 
this.worktables.getIndex(bmk_table).delete(urlHash); + } catch (Exception e) { + // nothing to do + } + } } public void deleteBookmark(final String bmk_user, final String url) throws IOException, SpaceExceededException { - this.deleteBookmark(bmk_user, YMarkUtil.getBookmarkId(url)); + final byte[] urlHash = YMarkUtil.getBookmarkId(url); + this.deleteBookmark(bmk_user, urlHash); } public TreeMap getTags(final Iterator rowIterator) { @@ -141,32 +197,65 @@ public class YMarkTables { public TreeMap getTags(final String bmk_user) throws IOException { final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); - final TreeMap tags = getTags(this.worktables.iterator(bmk_table)); - return tags; + final TreeMap tags = new TreeMap(); + if(this.worktables.hasIndex(bmk_table, YMarkEntry.BOOKMARK.TAGS.key())) { + try { + final TablesColumnIndex index = this.worktables.getIndex(bmk_table); + final Iterator iter = index.keySet(YMarkEntry.BOOKMARK.TAGS.key()).iterator(); + while(iter.hasNext()) { + final String tag = iter.next(); + tags.put(tag, new YMarkTag(tag, index.get(YMarkEntry.BOOKMARK.TAGS.key(), tag).size())); + } + return tags; + } catch (Exception e) { + // nothing to do + } + } + return getTags(this.worktables.iterator(bmk_table)); } - - public TreeSet getFolders(final String bmk_user, final String root) throws IOException { - final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); + public TreeSet getFolders(final String bmk_user, String root) throws IOException { final TreeSet folders = new TreeSet(); - final StringBuilder path = new StringBuilder(200); - final StringBuffer patternBuilder = new StringBuffer(BUFFER_LENGTH); - patternBuilder.setLength(0); - patternBuilder.append(p1); - patternBuilder.append('('); - patternBuilder.append(root); - patternBuilder.append(p7); - patternBuilder.append(p8); - final Pattern r = Pattern.compile(patternBuilder.toString()); - final Iterator bit = this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.FOLDERS.key(), r); 
- Tables.Row bmk_row = null; - + final StringBuilder path = new StringBuilder(BUFFER_LENGTH); + final String r = root + YMarkUtil.FOLDERS_SEPARATOR; + final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); + + // if exists, try the index first + if(this.worktables.hasIndex(bmk_table, YMarkEntry.BOOKMARK.FOLDERS.key())) { + TablesColumnIndex index; + try { + index = this.worktables.getIndex(bmk_table); + final Iterator fiter = index.keySet(YMarkEntry.BOOKMARK.FOLDERS.key()).iterator(); + while(fiter.hasNext()) { + final String folder = fiter.next(); + if(folder.startsWith(r)) { + path.setLength(0); + path.append(folder); + while(path.length() > 0 && !path.toString().equals(root)){ + final String p = path.toString(); + if(folders.isEmpty() || !p.equals(folders.floor(p))) { + folders.add(p); + } + path.setLength(path.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR)); + } + } + } + if (!root.equals(YMarkTables.FOLDERS_ROOT)) { folders.add(root); } + return folders; + } catch (Exception e) { + Log.logException(e); + } + } + + // by default iterate all bookmarks and extract folder information + final Iterator bit = this.worktables.iterator(bmk_table); + Tables.Row bmk_row = null; while(bit.hasNext()) { bmk_row = bit.next(); - if(bmk_row.containsKey(YMarkEntry.BOOKMARK.FOLDERS.key())) { + if(bmk_row.containsKey(YMarkEntry.BOOKMARK.FOLDERS.key())) { final String[] folderArray = (new String(bmk_row.get(YMarkEntry.BOOKMARK.FOLDERS.key()),"UTF8")).split(YMarkUtil.TAGS_SEPARATOR); for (final String folder : folderArray) { - if(folder.length() > root.length() && folder.substring(0, root.length()+1).equals(root+'/')) { + if(folder.length() > root.length() && folder.substring(0, root.length()+1).equals(r)) { if(!folders.contains(folder)) { path.setLength(0); path.append(folder); @@ -178,42 +267,25 @@ public class YMarkTables { } } } - } + } } - if (!root.equals(YMarkTables.FOLDERS_ROOT)) { folders.add(root); } - return folders; + if (!root.equals(YMarkTables.FOLDERS_ROOT)) { 
folders.add(root); } + return folders; } - - public Iterator getBookmarksByFolder(final String bmk_user, final String folder) throws IOException { + + public int getSize(final String bmk_user) throws IOException { final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); - final StringBuilder patternBuilder = new StringBuilder(BUFFER_LENGTH); - patternBuilder.setLength(0); - patternBuilder.append(p1); - patternBuilder.append('('); - patternBuilder.append(Pattern.quote(folder)); - patternBuilder.append(')'); - patternBuilder.append(p4); - final Pattern p = Pattern.compile(patternBuilder.toString()); - return this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.FOLDERS.key(), p); + return this.worktables.size(bmk_table); } - public Iterator getBookmarksByTag(final String bmk_user, final String[] tagArray) throws IOException { + public Iterator getBookmarksByFolder(final String bmk_user, final String foldersString) { final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); - final StringBuilder patternBuilder = new StringBuilder(BUFFER_LENGTH); - patternBuilder.setLength(0); - patternBuilder.append(p1); - patternBuilder.append(p5); - for (final String tag : tagArray) { - patternBuilder.append(Pattern.quote(tag)); - patternBuilder.append('|'); - } - patternBuilder.deleteCharAt(patternBuilder.length()-1); - patternBuilder.append(p6); + return this.worktables.getByIndex(bmk_table, YMarkEntry.BOOKMARK.FOLDERS.key(), YMarkEntry.BOOKMARK.FOLDERS.seperator(), foldersString); + } - patternBuilder.append(tagArray.length); - patternBuilder.append('}'); - final Pattern p = Pattern.compile(patternBuilder.toString(), Pattern.CASE_INSENSITIVE); - return this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.TAGS.key(), p); + public Iterator getBookmarksByTag(final String bmk_user, final String tagsString) { + final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); + return this.worktables.getByIndex(bmk_table, YMarkEntry.BOOKMARK.TAGS.key(), 
YMarkEntry.BOOKMARK.TAGS.seperator(), tagsString); } public List orderBookmarksBy(final Iterator rowIterator, final String sortname, final String sortorder) { @@ -239,6 +311,7 @@ public class YMarkTables { } public void replaceTags(final Iterator rowIterator, final String bmk_user, final String tagString, final String replaceString) throws IOException { + final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); final HashSet remove = YMarkUtil.keysStringToSet(YMarkUtil.cleanTagsString(tagString.toLowerCase())); final StringBuilder t = new StringBuilder(200); HashSet tags; @@ -253,7 +326,14 @@ public class YMarkTables { t.append(YMarkUtil.TAGS_SEPARATOR); t.append(replaceString); row.put(YMarkEntry.BOOKMARK.TAGS.key(), YMarkUtil.cleanTagsString(t.toString())); - this.worktables.update(TABLES.BOOKMARKS.tablename(bmk_user), row); + this.worktables.update(bmk_table, row); + if(this.worktables.hasIndex(bmk_table)) { + try { + this.worktables.getIndex(bmk_table).update(YMarkEntry.BOOKMARK.TAGS.key(), YMarkEntry.BOOKMARK.TAGS.seperator(), row); + } catch (Exception e) { + // nothing to do + } + } } } @@ -334,6 +414,12 @@ public class YMarkTables { bmk.put(YMarkEntry.BOOKMARK.DATE_MODIFIED.key(), date); } this.worktables.insert(bmk_table, urlHash, bmk.getData()); + try { + if(this.worktables.hasIndex(bmk_table)) + this.worktables.getIndex(bmk_table).add(YMarkEntry.BOOKMARK.indexColumns(), bmk, urlHash); + } catch (Exception e) { + // nothing to do + } } else { // modify and update existing entry HashSet oldSet; @@ -343,7 +429,7 @@ public class YMarkTables { switch(b) { case DATE_ADDED: if(!bmk_row.containsKey(b.key())) - bmk_row.put(b.key(), date); + bmk_row.put(b.key(), date); break; case DATE_MODIFIED: bmk_row.put(b.key(), date); @@ -386,10 +472,16 @@ public class YMarkTables { } else { bmk_row.put(b.key(), bmk_row.get(b.key(), b.deflt())); } - } + } } // update bmk_table this.worktables.update(bmk_table, bmk_row); + try { + if(this.worktables.hasIndex(bmk_table)) 
+ this.worktables.getIndex(bmk_table).update(YMarkEntry.BOOKMARK.indexColumns(), bmk_row); + } catch (Exception e) { + // nothing to do + } } } } diff --git a/source/de/anomic/data/ymark/YMarkUtil.java b/source/de/anomic/data/ymark/YMarkUtil.java index fda5f3d51..cfa350c9d 100644 --- a/source/de/anomic/data/ymark/YMarkUtil.java +++ b/source/de/anomic/data/ymark/YMarkUtil.java @@ -1,6 +1,6 @@ // YMarkUtil.java -// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany -// first published 2010 on http://yacy.net +// (C) 2011 by Stefan Foerster, sof@gmx.de, Norderstedt, Germany +// first published 2011 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // @@ -30,12 +30,14 @@ import java.net.MalformedURLException; import java.util.HashSet; import java.util.Iterator; +import net.yacy.cora.document.UTF8; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; public class YMarkUtil { public final static String TAGS_SEPARATOR = ","; public final static String FOLDERS_SEPARATOR = "/"; + public final static String SPACE = " "; public final static String EMPTY_STRING = new String(); /** @@ -60,7 +62,7 @@ public class YMarkUtil { } public final static byte[] keySetToBytes(final HashSet urlSet) { - return keySetToString(urlSet).getBytes(); + return UTF8.getBytes(keySetToString(urlSet)); } public final static String keySetToString(final HashSet urlSet) { @@ -106,17 +108,21 @@ public class YMarkUtil { ts.deleteCharAt(0); if (ts.length()>0 && ts.charAt(ts.length()-1) == TAGS_SEPARATOR.charAt(0)) ts.deleteCharAt(ts.length()-1); - return ts.toString(); + return new String(ts); } public final static String cleanFoldersString(final String foldersString) { return cleanFoldersString(foldersString, YMarkUtil.EMPTY_STRING); } - + public final static String cleanFoldersString(final String foldersString, final String dflt) { - if(foldersString.isEmpty()) + if(foldersString.isEmpty()) { return dflt; - StringBuilder fs 
= new StringBuilder(cleanTagsString(foldersString)); + } + return cleanFoldersString(new StringBuilder(cleanTagsString(foldersString))); + } + + public final static String cleanFoldersString(final StringBuilder fs) { if(fs.length() == 0) return YMarkEntry.BOOKMARK.FOLDERS.deflt(); for (int i = 0; i < fs.length()-1; i++) { @@ -132,7 +138,7 @@ public class YMarkUtil { } if (fs.charAt(fs.length()-1) == FOLDERS_SEPARATOR.charAt(0)) { fs.deleteCharAt(fs.length()-1); - } - return fs.toString(); + } + return new String(fs); } } diff --git a/source/de/anomic/data/ymark/YMarkXBELImporter.java b/source/de/anomic/data/ymark/YMarkXBELImporter.java index 128211a82..2041469c7 100644 --- a/source/de/anomic/data/ymark/YMarkXBELImporter.java +++ b/source/de/anomic/data/ymark/YMarkXBELImporter.java @@ -1,5 +1,5 @@ // YMarkXBELImporter.java -// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany +// (C) 2011 by Stefan Foerster, sof@gmx.de, Norderstedt, Germany // first published 2010 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine @@ -9,7 +9,7 @@ // $LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -26,35 +26,24 @@ package de.anomic.data.ymark; -import java.io.IOException; -import java.io.Reader; import java.text.ParseException; import java.util.HashMap; import java.util.HashSet; -import java.util.concurrent.ArrayBlockingQueue; import net.yacy.kelondro.logging.Log; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; -import org.xml.sax.SAXParseException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; -public class YMarkXBELImporter extends DefaultHandler implements Runnable { - - // Importer Variables - private final 
ArrayBlockingQueue bookmarks; - private final Reader bmk_file; - private final String RootFolder; - private final StringBuilder folderstring; - private YMarkEntry bmk; - private final XMLReader xmlReader; - - // Statics - public static enum XBEL { +public class YMarkXBELImporter extends YMarkImporter { + + // Statics + public static String IMPORTER = "XBEL"; + public static enum XBEL { NOTHING (""), XBEL ("'); return buffer.toString(); } } - - // Parser Variables - private final HashMap bmkRef; - private final HashSet aliasRef; - private final StringBuilder buffer; - private final StringBuilder folder; - - private YMarkEntry ref; - private XBEL outer_state; // BOOKMARK, FOLDER, NOTHING - private XBEL inner_state; // DESC, TITLE, INFO, ALIAS, (METADATA), NOTHING - private boolean parse_value; - - public YMarkXBELImporter (final Reader bmk_file, final int queueSize, final String root) throws SAXException { - this.bookmarks = new ArrayBlockingQueue(queueSize); - this.bmk_file = bmk_file; - this.RootFolder = root; - this.folderstring = new StringBuilder(YMarkTables.BUFFER_LENGTH); - this.folderstring.append(this.RootFolder); - this.bmk = new YMarkEntry(); - - this.xmlReader = XMLReaderFactory.createXMLReader(); - this.xmlReader.setContentHandler(this); - this.xmlReader.setFeature("http://xml.org/sax/features/namespace-prefixes", false); - this.xmlReader.setFeature("http://xml.org/sax/features/namespaces", false); - this.xmlReader.setFeature("http://xml.org/sax/features/validation", false); - - this.bmkRef = new HashMap(); - this.aliasRef = new HashSet(); - this.buffer = new StringBuilder(); - this.folder = new StringBuilder(YMarkTables.BUFFER_LENGTH); - this.folder.append(this.RootFolder); - } - - @Override - public void run() { - try { - this.xmlReader.parse(new InputSource(this.bmk_file)); - } catch (SAXParseException e) { - Log.logException(e); - } catch (SAXException e) { - Log.logException(e); - } catch (IOException e) { - Log.logException(e); - } finally { - try 
{ - Log.logInfo(YMarkTables.BOOKMARKS_LOG, "XBEL Importer inserted poison pill in queue"); - this.bookmarks.put(YMarkEntry.POISON); - } catch (InterruptedException e1) { - Log.logException(e1); - } - } - } - - @Override - public void endDocument() throws SAXException { - // put alias references in the bookmark queue to ensure that folders get updated - // we do that at endDocument to ensure all referenced bookmarks already exist - this.bookmarks.addAll(this.aliasRef); - this.aliasRef.clear(); - this.bmkRef.clear(); - } - - @Override - public void startElement(final String uri, final String name, String tag, final Attributes atts) throws SAXException { - YMarkDate date = new YMarkDate(); - if (tag == null) return; - tag = tag.toLowerCase(); - if (XBEL.BOOKMARK.tag().equals(tag)) { - this.bmk = new YMarkEntry(); - this.bmk.put(YMarkEntry.BOOKMARK.URL.key(), atts.getValue(uri, YMarkEntry.BOOKMARK.URL.xbel_attrb())); - //TODO: include a dynamic loop over all annotation tags - this.bmk.put(YMarkEntry.BOOKMARK.TAGS.key(), atts.getValue(uri, YMarkEntry.BOOKMARK.TAGS.xbel_attrb())); - this.bmk.put(YMarkEntry.BOOKMARK.PUBLIC.key(), atts.getValue(uri, YMarkEntry.BOOKMARK.PUBLIC.xbel_attrb())); - this.bmk.put(YMarkEntry.BOOKMARK.VISITS.key(), atts.getValue(uri, YMarkEntry.BOOKMARK.VISITS.xbel_attrb())); - try { - date.parseISO8601(atts.getValue(uri, YMarkEntry.BOOKMARK.DATE_ADDED.xbel_attrb())); - } catch (ParseException e) { - // TODO: exception handling - } - this.bmk.put(YMarkEntry.BOOKMARK.DATE_ADDED.key(), date.toString()); - try { - date.parseISO8601(atts.getValue(uri, YMarkEntry.BOOKMARK.DATE_VISITED.xbel_attrb())); - } catch (ParseException e) { - // TODO: exception handling - } - this.bmk.put(YMarkEntry.BOOKMARK.DATE_VISITED.key(), date.toString()); - try { - date.parseISO8601(atts.getValue(uri, YMarkEntry.BOOKMARK.DATE_MODIFIED.xbel_attrb())); - } catch (ParseException e) { - // TODO: exception handling - } - this.bmk.put(YMarkEntry.BOOKMARK.DATE_MODIFIED.key(), 
date.toString()); - UpdateBmkRef(atts.getValue(uri, YMarkEntry.BOOKMARKS_ID), true); - this.outer_state = XBEL.BOOKMARK; - this.inner_state = XBEL.NOTHING; - this.parse_value = false; - } else if(XBEL.FOLDER.tag().equals(tag)) { - this.outer_state = XBEL.FOLDER; - this.inner_state = XBEL.NOTHING; - } else if (XBEL.DESC.tag().equals(tag)) { - this.inner_state = XBEL.DESC; - this.parse_value = true; - } else if (XBEL.TITLE.tag().equals(tag)) { - this.inner_state = XBEL.TITLE; - this.parse_value = true; - } else if (XBEL.INFO.tag().equals(tag)) { - this.inner_state = XBEL.INFO; - this.parse_value = false; - } else if (XBEL.METADATA.tag().equals(tag)) { - // Support for old YaCy BookmarksDB XBEL Metadata (non valid XBEL) - if(this.outer_state == XBEL.BOOKMARK) { - final boolean isMozillaShortcutURL = atts.getValue(uri, "owner").equals("Mozilla") && !atts.getValue(uri, "ShortcutURL").isEmpty(); - final boolean isYacyPublic = atts.getValue(uri, "owner").equals("YaCy") && !atts.getValue(uri, "public").isEmpty(); - if(isMozillaShortcutURL) - this.bmk.put(YMarkEntry.BOOKMARK.TAGS.key(), YMarkUtil.cleanTagsString(atts.getValue(uri, "ShortcutURL"))); - if(isYacyPublic) - this.bmk.put(YMarkEntry.BOOKMARK.PUBLIC.key(), atts.getValue(uri, "public")); - } - } else if (XBEL.ALIAS.tag().equals(tag)) { - final String r = atts.getValue(uri, YMarkEntry.BOOKMARKS_REF); - UpdateBmkRef(r, false); - this.aliasRef.add(this.bmkRef.get(r)); - } - else { - this.outer_state = XBEL.NOTHING; - this.inner_state = XBEL.NOTHING; - this.parse_value = false; - } - } - - @Override - public void endElement(final String uri, final String name, String tag) { - if (tag == null) return; - tag = tag.toLowerCase(); - if(XBEL.BOOKMARK.tag().equals(tag)) { - // write bookmark - if (!this.bmk.isEmpty()) { - this.bmk.put(YMarkEntry.BOOKMARK.FOLDERS.key(), this.folder.toString()); - try { - this.bookmarks.put(this.bmk); - this.bmk = new YMarkEntry(); - } catch (InterruptedException e) { - Log.logException(e); + + 
// Importer Variables + private final XMLReader xmlReader; + + public YMarkXBELImporter (final MonitoredReader bmk_file, final int queueSize, final String targetFolder, final String sourceFolder) throws SAXException { + super(bmk_file, queueSize, targetFolder, sourceFolder); + setImporter(IMPORTER); + this.xmlReader = XMLReaderFactory.createXMLReader(); + this.xmlReader.setFeature(XML_NAMESPACE_PREFIXES, false); + this.xmlReader.setFeature(XML_NAMESPACES, false); + this.xmlReader.setFeature(XML_VALIDATION, false); + this.xmlReader.setContentHandler(new XBELParser()); + } + + public YMarkXBELImporter (final MonitoredReader bmk_file, final int queueSize, final String targetFolder) throws SAXException { + this(bmk_file, queueSize, "", targetFolder); + } + + public void parse() throws Exception { + xmlReader.parse(new InputSource(bmk_file)); + } + + public class XBELParser extends DefaultHandler { + + // Parser Variables + private final StringBuilder folderstring; + private final HashMap bmkRef; + private final HashSet aliasRef; + private final StringBuilder buffer; + private final StringBuilder folder; + + private YMarkEntry bmk; + private YMarkEntry ref; + private XBEL outer_state; // BOOKMARK, FOLDER, NOTHING + private XBEL inner_state; // DESC, TITLE, INFO, ALIAS, (METADATA), NOTHING + private boolean parse_value; + + + public XBELParser() { + this.folderstring = new StringBuilder(YMarkTables.BUFFER_LENGTH); + this.folderstring.append(targetFolder); + this.bmk = new YMarkEntry(); + this.bmkRef = new HashMap(); + this.aliasRef = new HashSet(); + this.buffer = new StringBuilder(); + this.folder = new StringBuilder(YMarkTables.BUFFER_LENGTH); + this.folder.append(targetFolder); + } + + public void endDocument() throws SAXException { + // put alias references in the bookmark queue to ensure that folders get updated + // we do that at endDocument to ensure all referenced bookmarks already exist + bookmarks.addAll(this.aliasRef); + this.aliasRef.clear(); + 
this.bmkRef.clear(); + } + + public void startElement(final String uri, final String name, String tag, final Attributes atts) throws SAXException { + YMarkDate date = new YMarkDate(); + if (tag == null) return; + tag = tag.toLowerCase(); + if (XBEL.BOOKMARK.tag().equals(tag)) { + this.bmk = new YMarkEntry(); + this.bmk.put(YMarkEntry.BOOKMARK.URL.key(), atts.getValue(uri, YMarkEntry.BOOKMARK.URL.xbel_attrb())); + //TODO: include a dynamic loop over all annotation tags + this.bmk.put(YMarkEntry.BOOKMARK.TAGS.key(), atts.getValue(uri, YMarkEntry.BOOKMARK.TAGS.xbel_attrb())); + this.bmk.put(YMarkEntry.BOOKMARK.PUBLIC.key(), atts.getValue(uri, YMarkEntry.BOOKMARK.PUBLIC.xbel_attrb())); + this.bmk.put(YMarkEntry.BOOKMARK.VISITS.key(), atts.getValue(uri, YMarkEntry.BOOKMARK.VISITS.xbel_attrb())); + try { + date.parseISO8601(atts.getValue(uri, YMarkEntry.BOOKMARK.DATE_ADDED.xbel_attrb())); + } catch (ParseException e) { + // TODO: exception handling } - } - this.outer_state = XBEL.FOLDER; - } else if (XBEL.FOLDER.tag().equals(tag)) { - // go up one folder - //TODO: get rid of .toString.equals() - if(!this.folder.toString().equals(this.RootFolder)) { - this.folder.setLength(this.folder.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR)); - } - this.outer_state = XBEL.FOLDER; - } else if (XBEL.INFO.tag().equals(tag)) { - this.inner_state = XBEL.NOTHING; - } else if (XBEL.METADATA.tag().equals(tag)) { - this.inner_state = XBEL.INFO; - } - } - - @Override - public void characters(final char ch[], final int start, final int length) { - if (this.parse_value) { - this.buffer.append(ch, start, length); - switch(this.outer_state) { - case BOOKMARK: - switch(this.inner_state) { - case DESC: - this.bmk.put(YMarkEntry.BOOKMARK.DESC.key(), this.buffer.toString().trim()); - break; - case TITLE: - this.bmk.put(YMarkEntry.BOOKMARK.TITLE.key(), this.buffer.toString().trim()); - break; - default: - break; - } - break; - case FOLDER: - switch(this.inner_state) { - case DESC: - break; - case TITLE: - 
this.folder.append(YMarkUtil.FOLDERS_SEPARATOR); - this.folder.append(this.buffer); - break; - default: - break; - } - break; - default: - break; - } - this.buffer.setLength(0); - this.parse_value = false; - } - } - - public YMarkEntry take() { - try { - return this.bookmarks.take(); - } catch (InterruptedException e) { - Log.logException(e); - return null; - } - } + this.bmk.put(YMarkEntry.BOOKMARK.DATE_ADDED.key(), date.toString()); + try { + date.parseISO8601(atts.getValue(uri, YMarkEntry.BOOKMARK.DATE_VISITED.xbel_attrb())); + } catch (ParseException e) { + // TODO: exception handling + } + this.bmk.put(YMarkEntry.BOOKMARK.DATE_VISITED.key(), date.toString()); + try { + date.parseISO8601(atts.getValue(uri, YMarkEntry.BOOKMARK.DATE_MODIFIED.xbel_attrb())); + } catch (ParseException e) { + // TODO: exception handling + } + this.bmk.put(YMarkEntry.BOOKMARK.DATE_MODIFIED.key(), date.toString()); + UpdateBmkRef(atts.getValue(uri, YMarkEntry.BOOKMARKS_ID), true); + this.outer_state = XBEL.BOOKMARK; + this.inner_state = XBEL.NOTHING; + this.parse_value = false; + } else if(XBEL.FOLDER.tag().equals(tag)) { + this.outer_state = XBEL.FOLDER; + this.inner_state = XBEL.NOTHING; + } else if (XBEL.DESC.tag().equals(tag)) { + this.inner_state = XBEL.DESC; + this.parse_value = true; + } else if (XBEL.TITLE.tag().equals(tag)) { + this.inner_state = XBEL.TITLE; + this.parse_value = true; + } else if (XBEL.INFO.tag().equals(tag)) { + this.inner_state = XBEL.INFO; + this.parse_value = false; + } else if (XBEL.METADATA.tag().equals(tag)) { + // Support for old YaCy BookmarksDB XBEL Metadata (non valid XBEL) + if(this.outer_state == XBEL.BOOKMARK) { + final boolean isMozillaShortcutURL = atts.getValue(uri, "owner").equals("Mozilla") && !atts.getValue(uri, "ShortcutURL").isEmpty(); + final boolean isYacyPublic = atts.getValue(uri, "owner").equals("YaCy") && !atts.getValue(uri, "public").isEmpty(); + if(isMozillaShortcutURL) + this.bmk.put(YMarkEntry.BOOKMARK.TAGS.key(), 
YMarkUtil.cleanTagsString(atts.getValue(uri, "ShortcutURL"))); + if(isYacyPublic) + this.bmk.put(YMarkEntry.BOOKMARK.PUBLIC.key(), atts.getValue(uri, "public")); + } + } else if (XBEL.ALIAS.tag().equals(tag)) { + final String r = atts.getValue(uri, YMarkEntry.BOOKMARKS_REF); + UpdateBmkRef(r, false); + this.aliasRef.add(this.bmkRef.get(r)); + } + else { + this.outer_state = XBEL.NOTHING; + this.inner_state = XBEL.NOTHING; + this.parse_value = false; + } + } - private void UpdateBmkRef(final String id, final boolean url) { - this.folderstring.setLength(0); + public void endElement(final String uri, final String name, String tag) { + if (tag == null) return; + tag = tag.toLowerCase(); + if(XBEL.BOOKMARK.tag().equals(tag)) { + // write bookmark + if (!this.bmk.isEmpty()) { + this.bmk.put(YMarkEntry.BOOKMARK.FOLDERS.key(), this.folder.toString()); + try { + bookmarks.put(this.bmk); + bmk = new YMarkEntry(); + } catch (InterruptedException e) { + Log.logException(e); + } + } + this.outer_state = XBEL.FOLDER; + } else if (XBEL.FOLDER.tag().equals(tag)) { + // go up one folder + //TODO: get rid of .toString.equals() + if(!this.folder.toString().equals(targetFolder)) { + folder.setLength(folder.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR)); + } + this.outer_state = XBEL.FOLDER; + } else if (XBEL.INFO.tag().equals(tag)) { + this.inner_state = XBEL.NOTHING; + } else if (XBEL.METADATA.tag().equals(tag)) { + this.inner_state = XBEL.INFO; + } + } - if(this.bmkRef.containsKey(id)) { - this.folderstring.append(this.bmkRef.get(id).get(YMarkEntry.BOOKMARK.FOLDERS.key())); - this.folderstring.append(','); - this.ref = this.bmkRef.get(id); - } else { - this.ref = new YMarkEntry(); - } - this.folderstring.append(this.folder); - if(url) - this.ref.put(YMarkEntry.BOOKMARK.URL.key(), this.bmk.get(YMarkEntry.BOOKMARK.URL.key())); - this.ref.put(YMarkEntry.BOOKMARK.FOLDERS.key(), this.folderstring.toString()); - this.bmkRef.put(id, this.ref); - } + public void characters(final char ch[], final 
int start, final int length) { + // TODO move string processing to endElement as characters() could be called more than once per tag + if (parse_value) { + buffer.append(ch, start, length); + switch(outer_state) { + case BOOKMARK: + switch(inner_state) { + case DESC: + this.bmk.put(YMarkEntry.BOOKMARK.DESC.key(), buffer.toString().trim()); + break; + case TITLE: + this.bmk.put(YMarkEntry.BOOKMARK.TITLE.key(), buffer.toString().trim()); + break; + default: + break; + } + break; + case FOLDER: + switch(inner_state) { + case DESC: + break; + case TITLE: + this.folder.append(YMarkUtil.FOLDERS_SEPARATOR); + this.folder.append(this.buffer); + break; + default: + break; + } + break; + default: + break; + } + this.buffer.setLength(0); + this.parse_value = false; + } + } + + private void UpdateBmkRef(final String id, final boolean url) { + this.folderstring.setLength(0); + + if(this.bmkRef.containsKey(id)) { + this.folderstring.append(this.bmkRef.get(id).get(YMarkEntry.BOOKMARK.FOLDERS.key())); + this.folderstring.append(','); + this.ref = this.bmkRef.get(id); + } else { + this.ref = new YMarkEntry(); + } + this.folderstring.append(this.folder); + if(url) + this.ref.put(YMarkEntry.BOOKMARK.URL.key(), this.bmk.get(YMarkEntry.BOOKMARK.URL.key())); + this.ref.put(YMarkEntry.BOOKMARK.FOLDERS.key(), this.folderstring.toString()); + this.bmkRef.put(id, ref); + } + } } diff --git a/source/net/yacy/cora/lod/vocabulary/AnnoteaA.java b/source/net/yacy/cora/lod/vocabulary/AnnoteaA.java new file mode 100644 index 000000000..eb0e2fef2 --- /dev/null +++ b/source/net/yacy/cora/lod/vocabulary/AnnoteaA.java @@ -0,0 +1,92 @@ +/** + * AnnoteaA + * Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 16.12.2011 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the 
License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +package net.yacy.cora.lod.vocabulary; + +import java.util.Set; + +import net.yacy.cora.lod.Literal; +import net.yacy.cora.lod.Vocabulary; + + +/** + * Annotea [Annotea] is a W3C Semantic Web Advanced Development project that + * provides a framework for rich communication about Web pages through shared RDF metadata. + * + * The Annotea Annotation schema [AnnotationNS] defines properties for identifying + * the document being annotated, a specific context within that document to which + * the body of the annotation refers, the author of the annotation, and more. + * + * http://www.w3.org/2003/07/Annotea/BookmarkSchema-20030707 + */ +public enum AnnoteaA implements Vocabulary { + + Annotation, // The target type of an annotation resource. + + annotates, // Relates an Annotation to the resource to which the Annotation applies. The inverse relation is 'hasAnnotation' + + author, // The name of the person or organization most responsible for creating the Annotation. Sub property of dc:creator + + body, // Relates the resource representing the 'content' of an Annotation to the Annotation resource. Sub property of related + + context, // The context within the resource named in 'annotates' to which the Annotation most directly applies + + created, // The date and time on which the Annotation was created. yyyy-mm-ddThh:mm:ssZ format recommended. Sub property of dc:date + + modified, // The date and time on which the Annotation was modified.
yyyy-mm-ddThh:mm:ssZ format recommended. Sub property of dc:date + + related; // A relationship between an annotation and additional resources that is less specific than 'body'. + // The 'related' property is expected to be subclassed by more specific relationships + + public final static String NAMESPACE = "http://www.w3.org/2000/10/annotation-ns#"; + public final static String PREFIX = "a"; + + private final String predicate; + + private AnnoteaA() { + this.predicate = NAMESPACE + this.name(); // full predicate: namespace followed by the constant's name + } + + @Override + public String getNamespace() { + return NAMESPACE; + } + + @Override + public String getNamespacePrefix() { + return PREFIX; + } + + @Override + public Set getLiterals() { + return null; // this vocabulary defines no literals; callers receive null, not an empty set + } + + @Override + public String getPredicate() { + return this.predicate; + } + + @Override + public String getURIref() { + return PREFIX + ':' + this.name(); + } +} diff --git a/source/net/yacy/cora/lod/vocabulary/AnnoteaB.java b/source/net/yacy/cora/lod/vocabulary/AnnoteaB.java new file mode 100644 index 000000000..3591c6d56 --- /dev/null +++ b/source/net/yacy/cora/lod/vocabulary/AnnoteaB.java @@ -0,0 +1,97 @@ +/** + * AnnoteaB + * Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 16.12.2011 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see .
+ */ + + +package net.yacy.cora.lod.vocabulary; + +import java.util.Set; + +import net.yacy.cora.lod.Literal; +import net.yacy.cora.lod.Vocabulary; + + +/** + * Annotea [Annotea] is a W3C Semantic Web Advanced Development project that + * provides a framework for rich communication about Web pages through shared RDF metadata. + * + * The Annotea Bookmark schema [BookmarkNS] provides the basic concepts found in common browser bookmark implementations. + * These basic concepts are also captured in the XML Bookmark Exchange Language [XBEL]. + * The use of RDF in Annotea permits bookmarks to express additional semantics. + * XBEL can be easily mapped into this schema. + * + * http://www.w3.org/2003/07/Annotea/BookmarkSchema-20030707 + */ +public enum AnnoteaB implements Vocabulary { + + Bookmark, // The class to which all bookmarks belong + + Shortcut, // Specifies a behavior; when the object of type 'Shortcut' is activated, the client follows the 'recalls' property + // and activates the object at the end of that 'recalls' property. The target object may be another Bookmark or may be a Topic. + + Topic, // + + bookmarks, // This corresponds to XBEL:href an object of type Bookmark is expected to have a 'recalls' relationship to the document being bookmarked. + // The 'bookmarks' property is an older name for the 'recalls' relationship. + + hasTopic, // relates a bookmark to a topic. A bookmark must have at least one hasTopic property. The typical user operation of following a bookmark link + // will use the value of the b:recalls property. This property corresponds to XBEL:href property.An instance of + leadsTo, // connects a Shortcut to the bookmark or topic that is being included by reference in some other topic + + recalls, // Relates a bookmark with the resource that has been bookmarked. 
This corresponds to XBEL:href; + // an object of type Bookmark is expected to have a 'recalls' relationship to the document being bookmarked + + subTopicOf; // Describes a relationship between Topics. When a topic T is a sub-topic of a topic U then all bookmarks that have topic T are also considered to have topic U. + // A topic may be a sub-topic of one or more topics; trivially, every topic is a sub-topic of itself. + // More formally; for all B, T, and U: b b:hasTopic T, T b:subTopicOf U implies B b:hasTopic U. + + public final static String NAMESPACE = "http://www.w3.org/2002/01/bookmark#"; + public final static String PREFIX = "b"; + + private final String predicate; + + private AnnoteaB() { + this.predicate = NAMESPACE + this.name(); + } + + @Override + public String getNamespace() { + return NAMESPACE; + } + + @Override + public String getNamespacePrefix() { + return PREFIX; + } + + @Override + public Set getLiterals() { + return null; + } + + @Override + public String getPredicate() { + return this.predicate; + } + + @Override + public String getURIref() { + return PREFIX + ':' + this.name(); + } +} diff --git a/source/net/yacy/cora/lod/vocabulary/DCElements.java b/source/net/yacy/cora/lod/vocabulary/DCElements.java new file mode 100644 index 000000000..2a6a0b8a9 --- /dev/null +++ b/source/net/yacy/cora/lod/vocabulary/DCElements.java @@ -0,0 +1,76 @@ +/** + * DublinCore + * Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 16.12.2011 at http://yacy.net + * + * $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $ + * $LastChangedRevision: 7653 $ + * $LastChangedBy: orbiter $ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +package net.yacy.cora.lod.vocabulary; + +import java.util.Set; + +import net.yacy.cora.lod.Literal; +import net.yacy.cora.lod.Vocabulary; + +public enum DCElements implements Vocabulary { + + + creator, + date, + description, + subject, + title; + + public final static String IDENTIFIER = "http://purl.org/dc/elements/1.1/"; + public final static String PREFIX = "dc"; + + private final String predicate, uriref; + + private DCElements() { + this.predicate = IDENTIFIER + this.name().toLowerCase(); + this.uriref = PREFIX + ':' + this.name().toLowerCase(); + } + + @Override + public String getNamespace() { + return IDENTIFIER; + } + + @Override + public String getNamespacePrefix() { + return PREFIX; + } + + @Override + public Set getLiterals() { + return null; + } + + @Override + public String getPredicate() { + return this.predicate; + } + + @Override + public String getURIref() { + return this.uriref; + } +} diff --git a/source/net/yacy/cora/lod/vocabulary/DMOZ.java b/source/net/yacy/cora/lod/vocabulary/DMOZ.java new file mode 100644 index 000000000..74080a26c --- /dev/null +++ b/source/net/yacy/cora/lod/vocabulary/DMOZ.java @@ -0,0 +1,104 @@ +/** + * DMOZ + * Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 16.12.2011 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +package net.yacy.cora.lod.vocabulary; + +import java.util.Set; + +import net.yacy.cora.lod.Literal; +import net.yacy.cora.lod.Vocabulary; + +/** + * The Open Directory Project is the largest, most comprehensive human-edited directory of the Web. + * It is constructed and maintained by a vast, global community of volunteer editors. + * + * RDF dumps of the Open Directory database are available for download at http://www.dmoz.org/rdf.html * + * An overview of the vocabulary can be found at http://rdf.dmoz.org/rdf/tags.html + */ +public enum DMOZ implements Vocabulary { + + // Content + ExternalPage, + atom, + link, + link1, + mediadate, + pdf, + pdf1, + priority, + rss, + rss1, + topic, + type, + + // Structure + Alias, + Target, + Topic, + altlang, + altlang1, + catid, + editor, + lastUpdate, + letterbar, + narrow, + narrow1, + narrow2, + newsgroup, + related, + symbolic, + symbolic1, + symbolic2; + + public final static String NAMESPACE = "http://dmoz.org/rdf/"; + public final static String PREFIX = "dmoz"; + + private final String predicate; + + private DMOZ() { + this.predicate = NAMESPACE + this.name().toLowerCase(); + } + + @Override + public String getNamespace() { + return NAMESPACE; + } + + @Override + public String getNamespacePrefix() { + return PREFIX; + } + + @Override + public Set getLiterals() { + return null; + } + + @Override + public String getPredicate() { + return this.predicate; + } + + @Override + public String getURIref() { + return PREFIX + ':' + this.name(); + } +} diff --git a/source/net/yacy/cora/lod/vocabulary/Rdf.java 
b/source/net/yacy/cora/lod/vocabulary/Rdf.java index a7a74935a..9b394d5d4 100644 --- a/source/net/yacy/cora/lod/vocabulary/Rdf.java +++ b/source/net/yacy/cora/lod/vocabulary/Rdf.java @@ -35,7 +35,8 @@ public enum Rdf implements Vocabulary { Description, Bag, Seq, - Alt; + Alt, + type; public final static String IDENTIFIER = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; public final static String PREFIX = "rdf"; diff --git a/source/net/yacy/kelondro/blob/Tables.java b/source/net/yacy/kelondro/blob/Tables.java index 66f24d8c8..04130ed58 100644 --- a/source/net/yacy/kelondro/blob/Tables.java +++ b/source/net/yacy/kelondro/blob/Tables.java @@ -34,10 +34,12 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; import java.util.TreeMap; +import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; @@ -50,16 +52,25 @@ import net.yacy.kelondro.util.ByteArray; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.LookAheadIterator; +import de.anomic.data.ymark.YMarkUtil; public class Tables implements Iterable { - private static final String suffix = ".bheap"; + public final static String p1 = "(?:^|.*,)"; + public final static String p2 = "((?:"; + public final static String p3 = ")(?:,.*|$)){"; + public final static String CIDX = "_cidx"; + public final static int NOINDEX = 50000; + public final static int RAMINDEX = 100000; + + private static final String suffix = ".bheap"; private static final String system_table_pkcounter = "pkcounter"; private static final String system_table_pkcounter_counterName = "pk"; private final File location; private final ConcurrentHashMap tables; + private final ConcurrentHashMap cidx; int keymaxlen; // use our own formatter to prevent concurrency locks with other processes @@ 
-82,6 +93,143 @@ public class Tables implements Iterable { } } } + this.cidx = new ConcurrentHashMap(); + } + + public TablesColumnIndex getIndex(final String tableName, TablesColumnIndex.INDEXTYPE indexType) throws Exception { + final TablesColumnIndex index; + switch(indexType) { + case RAM: + index = new TablesColumnRAMIndex(); + break; + case BLOB: + final String idx_table = tableName+CIDX; + BEncodedHeap bheap; + bheap = this.getHeap(idx_table); + index = new TablesColumnBLOBIndex(bheap); + break; + default: + throw new Exception("Unsupported TableColumnIndex: "+indexType.name()); + } + return index; + } + + public TablesColumnIndex getIndex(final String tableName) throws Exception { + // return an existing index + if(this.cidx.containsKey(tableName)) { + return this.cidx.get(tableName); + } + + // create a new index + int size; + try { + size = this.size(tableName); + } catch (IOException e) { + size = 0; + } + + final TablesColumnIndex index; + + if(size < NOINDEX) { + throw new Exception("TableColumnIndex not available for tables with less than "+NOINDEX+" rows: "+tableName); + } + if(size < RAMINDEX) { + index = new TablesColumnRAMIndex(); + } else { + final String idx_table = tableName+CIDX; + BEncodedHeap bheap; + try { + bheap = this.getHeap(idx_table); + } catch (IOException e) { + bheap = null; + Log.logException(e); + } + if(bheap != null) { + index = new TablesColumnBLOBIndex(bheap); + } else { + index = new TablesColumnRAMIndex(); + } + } + this.cidx.put(tableName, index); + return index; + } + + public boolean hasIndex (final String tableName) { + return this.cidx.containsKey(tableName); // cidx is keyed by table name; ConcurrentHashMap.contains(Object) tests the values and would always yield false here + } + + public boolean hasIndex (final String tableName, final String columnName) { + if(this.cidx.containsKey(tableName)) { + return this.cidx.get(tableName).hasIndex(columnName); + } + try { + if(this.has(tableName+CIDX, YMarkUtil.getKeyId(columnName))) { + return true; + } + } catch (IOException e) { + Log.logException(e); + } + return false; + } + + public
Iterator getByIndex(final String table, final String whereColumn, final String separator, final String whereValue) { + final HashSet rows = new HashSet(); + final TreeSet set1 = new TreeSet(TablesColumnIndex.NATURALORDER); + final TreeSet set2 = new TreeSet(TablesColumnIndex.NATURALORDER); + final String[] values = whereValue.split(separator); + if(this.hasIndex(table, whereColumn)) { + try { + final TablesColumnIndex index = this.getIndex(table); + for(int i=0; i biter = index.get(whereColumn, values[i]).iterator(); + while(biter.hasNext()) { + set1.add(biter.next()); + } + if(i==0) { + set2.addAll(set1); + } else { + set2.retainAll(set1); + } + set1.clear(); + } + } + for(byte[] pk : set2) { + rows.add(this.select(table, pk)); + } + + } catch (Exception e) { + Log.logException(e); + return new HashSet().iterator(); + } + } else if (!separator.isEmpty()) { + final StringBuilder patternBuilder = new StringBuilder(256); + patternBuilder.append(p1); + patternBuilder.append(p2); + for (final String value : values) { + patternBuilder.append(Pattern.quote(value)); + patternBuilder.append('|'); + } + patternBuilder.deleteCharAt(patternBuilder.length()-1); + patternBuilder.append(p3); + patternBuilder.append(values.length); + patternBuilder.append('}'); + final Pattern p = Pattern.compile(patternBuilder.toString(), Pattern.CASE_INSENSITIVE); + try { + return this.iterator(table, whereColumn, p); + } catch (IOException e) { + Log.logException(e); + return new HashSet().iterator(); + } + } else { + try { + return this.iterator(table, whereColumn, UTF8.getBytes(whereValue)); + } catch (IOException e) { + Log.logException(e); + return new HashSet().iterator(); + } + } + return rows.iterator(); } @Override diff --git a/source/net/yacy/kelondro/blob/TablesColumnBLOBIndex.java b/source/net/yacy/kelondro/blob/TablesColumnBLOBIndex.java new file mode 100644 index 000000000..98710f848 --- /dev/null +++ b/source/net/yacy/kelondro/blob/TablesColumnBLOBIndex.java @@ -0,0 +1,205 @@ +// 
TablesColumnBLOBIndex.java +// (C) 2012 by Stefan Foerster, sof@gmx.de, Norderstedt, Germany +// first published 2012 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.kelondro.blob; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; + +import net.yacy.cora.util.SpaceExceededException; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.ByteBuffer; +import de.anomic.data.ymark.YMarkUtil; + +public class TablesColumnBLOBIndex extends TablesColumnIndex{ + + // Map>> + private final BEncodedHeap index; + private final static byte SEPERATOR = (byte) ','; + + public TablesColumnBLOBIndex(final BEncodedHeap bheap) { + super(TablesColumnIndex.INDEXTYPE.BLOB); + this.index = bheap; + } + + public static Collection byteToCollection(final byte[] b) { + final Collection PKset = ByteBuffer.split(b, SEPERATOR); + return PKset; + } + + public static byte[] CollectionToByte(final Collection bc) { + final ByteBuffer buf = new ByteBuffer(15 * bc.size()); + final Iterator iter = bc.iterator(); 
+ while(iter.hasNext()) { + buf.append(iter.next()); + buf.append(SEPERATOR); + } + return buf.getBytes(); + } + + public void deleteIndex(final String columnName) { + final byte[] column = YMarkUtil.getKeyId(columnName); + try { + this.index.remove(column); + } catch (IOException e) { + Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); + } + } + + protected void insertPK(final String columnName, final String columnValue, final byte[] pk) { + Map valueIdxMap; + Collection PKset; + final byte[] column = YMarkUtil.getKeyId(columnName); + try { + valueIdxMap = this.index.get(column); + if(valueIdxMap != null) { + if(valueIdxMap.containsKey(columnValue)) { + PKset = byteToCollection(valueIdxMap.get(columnValue)); + if(!ByteBuffer.contains(PKset, pk)) { + PKset.add(pk); + } + } else { + PKset = new ArrayList(1); + PKset.add(pk); + valueIdxMap.put(columnValue, CollectionToByte(PKset)); + } + } else { + PKset = new ArrayList(1); + PKset.add(pk); + valueIdxMap = new ConcurrentHashMap(); + } + valueIdxMap.put(columnValue, CollectionToByte(PKset)); + this.index.insert(column, valueIdxMap); + return; + } catch (IOException e) { + Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); + } + } + + protected void removePK(final byte[] pk) { + final Iterator>> niter = this.index.iterator(); + while (niter.hasNext()) { + final Map.Entry> entry = niter.next(); + final Iterator> viter = entry.getValue().entrySet().iterator(); + while(viter.hasNext()) { + final Map.Entry columnValue = viter.next(); + final Collection PKset = byteToCollection(columnValue.getValue()); + ByteBuffer.remove(PKset, pk); + if(PKset.isEmpty()) { + viter.remove(); + } else { + columnValue.setValue(CollectionToByte(PKset)); + } + } + try { + this.index.insert(entry.getKey(), entry.getValue()); + } catch (SpaceExceededException e) { + Log.logException(e); + } catch (IOException e) { + Log.logException(e); + } + } + } + + public void clear() { + 
this.index.clear(); + } + + public Collection columns() { + return this.index.columns(); + } + + public Set keySet(final String columnName) { + final byte[] column = YMarkUtil.getKeyId(columnName); + // a TreeSet is used to get sorted set of keys (e.g. folders) + if(this.index.containsKey(column)) { + try { + return new TreeSet(this.index.get(column).keySet()); + } catch (IOException e) { + Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); + } + } + return new TreeSet(); + } + + public boolean containsKey(final String columnName, final String key) { + final byte[] column = YMarkUtil.getKeyId(columnName); + if(this.index.containsKey(column)) { + try { + return this.index.get(column).containsKey(key); + } catch (IOException e) { + Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); + } + } + return false; + } + + public boolean hasIndex(final String columnName) { + final byte[] column = YMarkUtil.getKeyId(columnName); + return this.index.containsKey(column); + } + + public Collection get(final String columnName, final String key) { + final byte[] column = YMarkUtil.getKeyId(columnName); + // deserialize + try { + return byteToCollection(this.index.get(column).get(key)); + } catch (IOException e) { + Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); + } + return new ArrayList(); + } + + public int size(final String columnName) { + final byte[] column = YMarkUtil.getKeyId(columnName); + if(this.index.containsKey(column)) { + try { + return this.index.get(column).size(); + } catch (IOException e) { + Log.logException(e); + } catch (SpaceExceededException e) { + Log.logException(e); + } + } + return -1; + } + + public int size() { + return this.index.size(); + } +} \ No newline at end of file diff --git a/source/net/yacy/kelondro/blob/TablesColumnIndex.java b/source/net/yacy/kelondro/blob/TablesColumnIndex.java new file mode 100644 index 000000000..07ee6e5d0 --- /dev/null +++ 
b/source/net/yacy/kelondro/blob/TablesColumnIndex.java @@ -0,0 +1,176 @@ +// TablesColumnIndex.java +// (C) 2012 by Stefan Foerster, sof@gmx.de, Norderstedt, Germany +// first published 2012 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.kelondro.blob; + +import java.util.Collection; +import java.util.Comparator; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +import net.yacy.cora.document.UTF8; +import net.yacy.kelondro.order.NaturalOrder; + + +/** + * a mapping from a column name to maps with the value of the columns to the primary keys where the entry exist in the table + */ +public abstract class TablesColumnIndex { + + public static enum INDEXTYPE {RAM, BLOB} + private INDEXTYPE type; + // Map>> + // private final Map>> index; + + protected final static Comparator NATURALORDER = new NaturalOrder(true); + + protected abstract void insertPK(final String columnName, final String columnValue, final byte[] pk); + protected abstract void removePK(final byte[] pk); + protected abstract void clear(); + + public abstract Set keySet(final String columnName); + public abstract boolean containsKey(final String columnName, final String key); + public 
abstract boolean hasIndex(final String columnName); + public abstract Collection get(final String columnName, final String key); + public abstract int size(final String columnName); + public abstract int size(); + public abstract Collection columns(); + public abstract void deleteIndex(final String columnName); + + public TablesColumnIndex(INDEXTYPE type) { + this.type = type; + } + + public INDEXTYPE getType() { + return this.type; + } + + /** + * create an index for a given table and column + * @param columnName - name of the column you want to build an index for + * @param valueIsArray - indicates whether the column value consist of an array (e.g. comma separated tags) + * @param separator - a string value used to split column values into an array + * @param table - an iterator over table rows which should be added to the index + */ + public synchronized void buildIndex(final String columnName, final String separator, final Iterator table) { + this.deleteIndex(columnName); + // loop through all rows of the table + while (table.hasNext()) { + this.add(columnName, separator, table.next()); + } + } + + /** + * create an index for a given table and column + * @param columnName - name of the column you want to build an index for + * @param table - an iterator over table rows which should be added to the index + */ + public synchronized void buildIndex(final String columnName, final Iterator table) { + this.buildIndex(columnName, "", table); + } + + /** + * create an index for a given table and given columns + * @param columns - a map of column names and booleans for 'valueIsArray' you want to build an index for + * @param separator - a string value used to split column values into an array + * @param table - an iterator over table rows which should be added to the index + */ + public synchronized void buildIndex(final Map columns, final Iterator table) { + this.clear(); + // loop through all rows of the table + while (table.hasNext()) { + this.add(columns, 
table.next()); + } + } + + private void insertPK(final String columnName, final String[] columnValues, final byte[] pk) { + for (String columnValue : columnValues) { + this.insertPK(columnName, columnValue, pk); + } + } + + public void delete(final byte[] pk) { + this.removePK(pk); + } + + public void delete(final Tables.Row row) { + this.removePK(row.getPK()); + } + + public void update(final String columnName, final String separator, final Tables.Row row) { + this.removePK(row.getPK()); + this.add(columnName, separator, row); + } + + public void update(final Map columns, final Tables.Row row) { + this.removePK(row.getPK()); + this.add(columns, row); + } + + public void add(final String columnName, final String separator, final Map map, final byte[] pk) { + if(separator.isEmpty()) + this.insertPK(columnName, map.get(columnName), pk); + else + this.insertPK(columnName, map.get(columnName).split(separator), pk); + } + + public void add(final String columnName, final String separator, final Tables.Data row, final byte[] pk) { + if(separator.isEmpty()) + this.insertPK(columnName, UTF8.String(row.get(columnName)), pk); + else + this.insertPK(columnName, UTF8.String(row.get(columnName)).split(separator), pk); + } + + public void add(final String columnName, final String separator, final Tables.Row row) { + if(separator.isEmpty()) + this.insertPK(columnName, UTF8.String(row.get(columnName)), row.getPK()); + else + this.insertPK(columnName, UTF8.String(row.get(columnName)).split(separator), row.getPK()); + } + + public void add(final Map columns, final Map map, final byte[] pk) { + final Iterator iter = columns.keySet().iterator(); + while (iter.hasNext()) { + final String columnName = iter.next(); + if(columns.get(columnName).isEmpty()) + this.insertPK(columnName, map.get(columnName), pk); + else + this.insertPK(columnName, map.get(columnName).split(columns.get(columnName)), pk); + } + } + + public void add(final Map columns, final Tables.Data row, final byte[] pk) { + 
final Iterator iter = columns.keySet().iterator(); + while (iter.hasNext()) { + final String columnName = iter.next(); + if(columns.get(columnName).isEmpty()) + this.insertPK(columnName, UTF8.String(row.get(columnName)), pk); + else + this.insertPK(columnName, UTF8.String(row.get(columnName)).split(columns.get(columnName)), pk); + } + } + + public void add(final Map columns, final Tables.Row row) { + this.add(columns, row, row.getPK()); + } +} \ No newline at end of file diff --git a/source/net/yacy/kelondro/blob/TablesColumnRAMIndex.java b/source/net/yacy/kelondro/blob/TablesColumnRAMIndex.java new file mode 100644 index 000000000..e3affdda1 --- /dev/null +++ b/source/net/yacy/kelondro/blob/TablesColumnRAMIndex.java @@ -0,0 +1,124 @@ +// TablesColumnRAMIndex.java +// (C) 2012 by Stefan Foerster, sof@gmx.de, Norderstedt, Germany +// first published 2012 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.kelondro.blob; + +import java.util.Collection; +import java.util.Comparator; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; + +import net.yacy.kelondro.order.NaturalOrder; + +public class TablesColumnRAMIndex extends TablesColumnIndex{ + + // Map>> + private final Map>> index; + + private final static Comparator NATURALORDER = new NaturalOrder(true); + + public TablesColumnRAMIndex() { + super(TablesColumnIndex.INDEXTYPE.RAM); + this.index = new ConcurrentHashMap>>(); + } + + public void deleteIndex(final String columnName) { + this.index.remove(columnName); + } + + protected void insertPK(final String columnName, final String columnValue, final byte[] pk) { + Map> valueIdxMap; + TreeSet PKset; + if(this.index.containsKey(columnName)) { + valueIdxMap = this.index.get(columnName); + } + else { + valueIdxMap = new ConcurrentHashMap>(); + this.index.put(columnName, valueIdxMap); + } + if(valueIdxMap.containsKey(columnValue)) { + PKset = valueIdxMap.get(columnValue); + } + else { + PKset = new TreeSet(NATURALORDER); + valueIdxMap.put(columnValue, PKset); + } + PKset.add(pk); + } + + protected synchronized void removePK(final byte[] pk) { + for(Map.Entry>> columnName : this.index.entrySet()) { + final Iterator>> viter = columnName.getValue().entrySet().iterator(); + while(viter.hasNext()) { + final Map.Entry> columnValue = viter.next(); + columnValue.getValue().remove(pk); + if(columnValue.getValue().isEmpty()) + viter.remove(); + } + } + } + + public void clear() { + this.index.clear(); + } + + public Collection columns() { + return this.index.keySet(); + } + + public Set keySet(final String columnName) { + // a TreeSet is used to get 
sorted set of keys (e.g. folders) + if(this.index.containsKey(columnName)) { + return new TreeSet(this.index.get(columnName).keySet()); + } + return new TreeSet(); + } + + public boolean containsKey(final String columnName, final String columnValue) { + if(this.index.containsKey(columnName)) { + return this.index.get(columnName).containsKey(columnValue); + } + return false; + } + + public boolean hasIndex(final String columnName) { + return this.index.containsKey(columnName); + } + + public Collection get(final String columnName, final String key) { + return this.index.get(columnName).get(key); + } + + public int size(final String columnName) { + if(this.index.containsKey(columnName)) { + return this.index.get(columnName).size(); + } + return -1; + } + + public int size() { + return this.index.size(); + } +} \ No newline at end of file From 59bd478ed179f366ef52f6a34f7084bcda6ac48f Mon Sep 17 00:00:00 2001 From: apfelmaennchen Date: Sun, 9 Sep 2012 22:56:24 +0200 Subject: [PATCH 2/2] Added more sophisticated RDF output for YMarks, including the folder structure (b:Topic) and support for multiple tags (dc:subject) and folders (b:hasTopic) via rdf:Bag container. 
--- htroot/YMarks.java | 27 +++++---- source/de/anomic/data/ymark/YMarkRDF.java | 73 +++++++++++++++++++---- 2 files changed, 74 insertions(+), 26 deletions(-) diff --git a/htroot/YMarks.java b/htroot/YMarks.java index 193cf6f84..d8d9cbd86 100644 --- a/htroot/YMarks.java +++ b/htroot/YMarks.java @@ -29,18 +29,19 @@ public class YMarks { YMarkRDF rdf = new YMarkRDF("http://"+sb.peers.myAlternativeAddress()); if(post != null && post.containsKey(YMarkEntry.BOOKMARKS_ID)) { - final String id = post.get(YMarkEntry.BOOKMARKS_ID); - final int i = id.indexOf(':'); - final String bmk_user = id.substring(0,i); - final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); - final byte[] urlHash = UTF8.getBytes(id.substring(i+1, id.length())); - Tables.Row bmk_row; - try { - bmk_row = sb.tables.select(bmk_table, urlHash); - rdf.addBookmark(bmk_user, bmk_row); - } catch (IOException e) { - } catch (SpaceExceededException e) { - } + final String id[] = post.get(YMarkEntry.BOOKMARKS_ID).split(":"); + if(id[1].equals("b")) { + final String bmk_user = id[0]; + final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); + final byte[] urlHash = UTF8.getBytes(id[2]); + Tables.Row bmk_row; + try { + bmk_row = sb.tables.select(bmk_table, urlHash); + rdf.addBookmark(bmk_user, bmk_row); + } catch (IOException e) { + } catch (SpaceExceededException e) { + } + } } else { final Iterator iter = sb.tables.iterator(); while(iter.hasNext()) { @@ -57,7 +58,7 @@ public class YMarks { } } } - prop.put("rdf", rdf.getRDF("RDF/XML")); + prop.put("rdf", rdf.getRDF("RDF/XML-ABBREV")); return prop; } if(isAdmin || isAuthUser) { diff --git a/source/de/anomic/data/ymark/YMarkRDF.java b/source/de/anomic/data/ymark/YMarkRDF.java index f3bdadb65..79095a06b 100644 --- a/source/de/anomic/data/ymark/YMarkRDF.java +++ b/source/de/anomic/data/ymark/YMarkRDF.java @@ -13,6 +13,7 @@ import net.yacy.cora.lod.vocabulary.DCElements; import net.yacy.cora.lod.vocabulary.Rdf; import net.yacy.kelondro.blob.Tables; 
+import com.hp.hpl.jena.rdf.model.Bag; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.rdf.model.Property; @@ -24,9 +25,11 @@ public class YMarkRDF { public final static String USER = "USER"; public final static String TYPE = "TYPE"; + public final static String SUBTOPIC = "SUBTOPIC"; + private final Map property; - public final static String BOOKMARK = "/Ymarks.rdf?id="; + public final static String BOOKMARK = "/YMarks.rdf?id="; private final StringBuilder resourceURI; private final int len; @@ -54,6 +57,7 @@ public class YMarkRDF { this.property.put(USER, this.model.createProperty(DCElements.creator.getNamespace(), DCElements.creator.name())); this.property.put(TYPE, this.model.createProperty(Rdf.type.getNamespace(), Rdf.type.name())); + this.property.put(SUBTOPIC, this.model.createProperty(AnnoteaB.subTopicOf.getNamespace(), AnnoteaB.subTopicOf.name())); } /** @@ -70,40 +74,83 @@ public class YMarkRDF { } } + public void addTopic(final String bmk_user, final String folder) { + this.resourceURI.append(bmk_user); + this.resourceURI.append(":f:"); + this.resourceURI.append(UTF8.String(YMarkUtil.getKeyId(folder))); + final Resource topic = this.model.createResource(this.resourceURI.toString()); + this.resourceURI.setLength(this.len); + + topic.addProperty(this.property.get(YMarkEntry.BOOKMARK.DATE_MODIFIED.key()), YMarkUtil.EMPTY_STRING); + topic.addProperty(this.property.get(YMarkEntry.BOOKMARK.DATE_ADDED.key()), YMarkUtil.EMPTY_STRING); + topic.addProperty(this.property.get(USER), bmk_user); + topic.addProperty(this.property.get(YMarkEntry.BOOKMARK.DESC.key()), YMarkUtil.EMPTY_STRING); + final int i = folder.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR); + if(i>0) + topic.addProperty(this.property.get(SUBTOPIC), folder.substring(0, i)); + topic.addProperty(this.property.get(YMarkEntry.BOOKMARK.TITLE.key()), folder); + topic.addProperty(this.property.get(TYPE), AnnoteaB.Topic.getPredicate()); + } + public 
void addBookmark (final String bmk_user, final Tables.Row bmk_row) { if(bmk_row == null || bmk_row.get(YMarkEntry.BOOKMARK.PUBLIC.key(), YMarkEntry.BOOKMARK.PUBLIC.deflt()).equals("false")) - return; - final Resource bmk; + return; // create an annotea bookmark resource this.resourceURI.append(bmk_user); - this.resourceURI.append(':'); + this.resourceURI.append(":b:"); this.resourceURI.append(UTF8.String(bmk_row.getPK())); - bmk = this.model.createResource(this.resourceURI.toString()); + final Resource bmk = this.model.createResource(this.resourceURI.toString()); this.resourceURI.setLength(this.len); // add properties - bmk.addProperty(this.property.get(TYPE), AnnoteaB.Bookmark.getPredicate()); bmk.addProperty(this.property.get(USER), bmk_user); - for (final YMarkEntry.BOOKMARK b : YMarkEntry.BOOKMARK.values()) { - switch(b) { - case FOLDERS: - final String[] folders = bmk_row.get(b.key(), b.deflt()).split(YMarkUtil.TAGS_SEPARATOR); - for(String folder : folders) { - bmk.addProperty(this.property.get(b.key()), folder); - // TODO add Topics to RDF + for (final YMarkEntry.BOOKMARK b : YMarkEntry.BOOKMARK.values()) { + switch(b) { + case FOLDERS: + final String[] folders = bmk_row.get(b.key(), b.deflt()).split(b.seperator()); + if(folders.length > 1) { + Bag topics = this.model.createBag(); + for(String folder : folders) { + topics.add(folder); + this.addTopic(bmk_user, folder); + } + bmk.addProperty(this.property.get(b.key()), topics); + } else { + bmk.addProperty(this.property.get(b.key()), folders[0]); + this.addTopic(bmk_user, folders[0]); + } + break; + case TAGS: + final String[] tags = bmk_row.get(b.key(), b.deflt()).split(b.seperator()); + if(tags.length > 1) { + Bag subjects = this.model.createBag(); + for(String tag : tags) { + subjects.add(tag); + } + bmk.addProperty(this.property.get(b.key()), subjects); + } else { + bmk.addProperty(this.property.get(b.key()), tags[0]); } break; + case DATE_ADDED: case DATE_MODIFIED: final YMarkDate date = new 
YMarkDate(bmk_row.get(b.key())); bmk.addProperty(this.property.get(b.key()), date.toISO8601()); break; + // these cases are inserted for better readable RDF output + case DESC: + case URL: + case TITLE: + bmk.addProperty(this.property.get(b.key()), bmk_row.get(b.key(), b.deflt())); + break; default: if(this.property.containsKey(b.key())) { bmk.addProperty(this.property.get(b.key()), bmk_row.get(b.key(), b.deflt())); } } } + bmk.addProperty(this.property.get(TYPE), AnnoteaB.Bookmark.getPredicate()); } public void addBookmarks(final String bmk_user, final Iterator riter) {