From 403ee9c0140270584561d7bd149f650bbc2f1dfb Mon Sep 17 00:00:00 2001 From: apfelmaennchen Date: Tue, 16 Nov 2010 00:48:38 +0000 Subject: [PATCH] added a drill-down for metadata and word count to /api/ymarks/test_treeview.html git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7324 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Table_YMark_p.java | 8 +- htroot/api/ymarks/get_treeview.java | 89 +++++++++++++++--- source/de/anomic/data/YMarkTables.java | 124 ++++++++++++++++++++++++- 3 files changed, 204 insertions(+), 17 deletions(-) diff --git a/htroot/Table_YMark_p.java b/htroot/Table_YMark_p.java index 170e4ef7e..c81cafab1 100644 --- a/htroot/Table_YMark_p.java +++ b/htroot/Table_YMark_p.java @@ -216,14 +216,12 @@ public class Table_YMark_p { count = 0; try { Iterator mapIterator; - if(post.containsKey("folders") && !post.get("folders").isEmpty()) { - mapIterator = sb.tables.bookmarks.folders.getBookmarks(bmk_user, post.get("folders")); + mapIterator = sb.tables.orderByPK(sb.tables.bookmarks.folders.getBookmarks(bmk_user, post.get("folders")), maxcount).iterator(); } else if(post.containsKey("tags") && !post.get("tags").isEmpty()) { - mapIterator = sb.tables.bookmarks.tags.getBookmarks(bmk_user, post.get("tags")); + mapIterator = sb.tables.orderByPK(sb.tables.bookmarks.tags.getBookmarks(bmk_user, post.get("tags")), maxcount).iterator(); } else { - final Iterator plainIterator = sb.tables.iterator(table, matcher); - mapIterator = sb.tables.orderByPK(plainIterator, maxcount).iterator(); + mapIterator = sb.tables.orderByPK(sb.tables.iterator(table, matcher), maxcount).iterator(); } Tables.Row row; diff --git a/htroot/api/ymarks/get_treeview.java b/htroot/api/ymarks/get_treeview.java index 37a0f90b4..e7bf2f8a3 100644 --- a/htroot/api/ymarks/get_treeview.java +++ b/htroot/api/ymarks/get_treeview.java @@ -1,7 +1,9 @@ import java.io.IOException; +import java.net.MalformedURLException; import java.util.Date; +import java.util.EnumMap; import java.util.Iterator; - +import java.util.Map; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.index.RowSpaceExceededException; @@ -9,6 +11,8 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; import de.anomic.data.YMarkTables; import de.anomic.data.userDB; +import de.anomic.data.YMarkTables.METADATA; +import de.anomic.search.Segments; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -26,14 +30,17 @@ public class get_treeview { final userDB.Entry user = sb.userDB.getUser(header); final boolean isAdmin = (sb.verifyAuthentication(header, true)); final boolean isAuthUser = user!= null && user.hasRight(userDB.Entry.BOOKMARK_RIGHT); + if(isAdmin || isAuthUser) { final String bmk_user = (isAuthUser ? user.getUserName() : YMarkTables.USER_ADMIN); - - + String root = YMarkTables.FOLDERS_ROOT; String[] foldername = null; boolean isFolder = true; + boolean isBookmark = false; + boolean isMetadata = false; + boolean isWordCount = false; if (post != null){ if (post.containsKey(ROOT)) { @@ -41,8 +48,14 @@ public class get_treeview { root = ""; } else if (post.get(ROOT).startsWith(YMarkTables.FOLDERS_ROOT)) { root = post.get(ROOT); - } else { - // root = YMarkTables.FOLDERS_ROOT + post.get(ROOT); + } else if (post.get(ROOT).startsWith("b:")) { + isBookmark = true; + isFolder = false; + } else if (post.get(ROOT).startsWith("m:")) { + isMetadata = true; + isFolder = false; + } else if (post.get(ROOT).startsWith("w:")) { + isWordCount = true; isFolder = false; } } @@ -99,7 +112,7 @@ public class get_treeview { prop.put("folders_"+count+"_expanded", "false"); prop.put("folders_"+count+"_url", url); prop.put("folders_"+count+"_type", "file"); - prop.put("folders_"+count+"_hash", urlHash); + prop.put("folders_"+count+"_hash", "b:"+urlHash); prop.put("folders_"+count+"_hasChildren", "true"); prop.put("folders_"+count+"_comma", ","); count++; @@ -114,9 +127,11 @@ public class get_treeview { } catch (RowSpaceExceededException e) { Log.logException(e); } - } else { + } else if(isBookmark) { try { - bmk_row = sb.tables.select(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user), post.get(ROOT).getBytes()); + final String urlHash = post.get(ROOT).substring(2); + String url = ""; + bmk_row = sb.tables.select(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user), urlHash.getBytes()); if(bmk_row != null) { it = bmk_row.keySet().iterator(); while(it.hasNext()) { @@ -131,6 +146,8 @@ public class get_treeview { } } else { final String value = new String(bmk_row.get(key)); + if (key.equals("url")) + url = value; prop.put("folders_"+count+"_foldername",""+key+": " + value + ""); if(YMarkTables.BOOKMARK.contains(key)) putProp(count, YMarkTables.BOOKMARK.get(key).type()); @@ -139,9 +156,17 @@ public class get_treeview { count++; } } - count--; - prop.put("folders_"+count+"_comma", ""); - count++; + prop.put("folders_"+count+"_foldername","MetaData"); + putProp(count, "meta"); + prop.put("folders_"+count+"_hash", "m:"+url); + prop.put("folders_"+count+"_hasChildren", "true"); + count++; + prop.put("folders_"+count+"_foldername","WordCount"); + putProp(count, "meta"); + prop.put("folders_"+count+"_hash", "w:"+url); + prop.put("folders_"+count+"_hasChildren", "true"); + prop.put("folders_"+count+"_comma", ""); + count++; prop.put("folders", count); } } catch (IOException e) { @@ -149,6 +174,48 @@ public class get_treeview { } catch (RowSpaceExceededException e) { Log.logException(e); } + } else if (isWordCount) { + try { + final Map words = YMarkTables.getWordFrequencies(post.get(ROOT).substring(2), sb.loader); + final Iterator iter = words.keySet().iterator(); + while (iter.hasNext()) { + String key = iter.next(); + int value = words.get(key); + if(value > 5 && value < 15) { + prop.put("folders_"+count+"_foldername",""+key+": [" + value + "]"); + putProp(count, "meta"); + count++; + } + } + count--; + prop.put("folders_"+count+"_comma", ""); + count++; + prop.put("folders", count); + } catch (MalformedURLException e) { + Log.logException(e); + } + } else if (isMetadata) { + try { + final String url = post.get(ROOT).substring(2); + EnumMap metadata; + metadata = YMarkTables.getMetadata(YMarkTables.getBookmarkId(url), sb.indexSegments.segment(Segments.Process.PUBLIC)); + if (metadata.isEmpty()) + metadata = YMarkTables.loadMetadata(url, sb.loader); + final Iterator iter = metadata.keySet().iterator(); + while (iter.hasNext()) { + final METADATA key = iter.next(); + final String value = metadata.get(key); + prop.put("folders_"+count+"_foldername",""+key.toString().toLowerCase()+": " + value + ""); + putProp(count, "meta"); + count++; + } + count--; + prop.put("folders_"+count+"_comma", ""); + count++; + prop.put("folders", count); + } catch (MalformedURLException e) { + Log.logException(e); + } } } else { prop.put(YMarkTables.USER_AUTHENTICATE,YMarkTables.USER_AUTHENTICATE_MSG); diff --git a/source/de/anomic/data/YMarkTables.java b/source/de/anomic/data/YMarkTables.java index f1f290a6d..9f50ea6e3 100644 --- a/source/de/anomic/data/YMarkTables.java +++ b/source/de/anomic/data/YMarkTables.java @@ -4,19 +4,30 @@ import java.io.IOException; import java.net.MalformedURLException; import java.text.ParseException; import java.text.SimpleDateFormat; +import java.util.Collection; import java.util.Date; +import java.util.EnumMap; import java.util.EnumSet; +import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; - +import net.yacy.document.Condenser; +import net.yacy.document.Document; +import net.yacy.document.Parser.Failure; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.blob.Tables.Data; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowSpaceExceededException; +import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; +import net.yacy.repository.LoaderDispatcher; +import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.retrieval.Response; +import de.anomic.search.Segment; public class YMarkTables { @@ -122,6 +133,25 @@ public class YMarkTables { } } + public enum METADATA { + TITLE, + DESCRIPTION, + FAVICON, + KEYWORDS, + LANGUAGE, + CREATOR, + PUBLISHER, + CHARSET, + MIMETYPE, + SIZE, + WORDCOUNT, + IN_URLDB, + FRESHDATE, + LOADDATE, + MODDATE, + SNIPPET + } + public final static HashMap POISON = new HashMap(); public final static String TAGS_SEPARATOR = ","; @@ -379,4 +409,96 @@ public class YMarkTables { } } } + + public static EnumMap getMetadata(final byte[] urlHash, final Segment indexSegment) throws MalformedURLException { + final EnumMap metadata = new EnumMap(METADATA.class); + final URIMetadataRow urlEntry = indexSegment.urlMetadata().load(urlHash, null, 0); + if (urlEntry != null) { + metadata.put(METADATA.IN_URLDB, "true"); + metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size())); + metadata.put(METADATA.FRESHDATE, DateFormatter.formatISO8601(urlEntry.freshdate())); + metadata.put(METADATA.LOADDATE, DateFormatter.formatISO8601(urlEntry.loaddate())); + metadata.put(METADATA.MODDATE, DateFormatter.formatISO8601(urlEntry.moddate())); + metadata.put(METADATA.SNIPPET, String.valueOf(urlEntry.snippet())); + metadata.put(METADATA.WORDCOUNT, String.valueOf(urlEntry.wordCount())); + metadata.put(METADATA.MIMETYPE, String.valueOf(urlEntry.doctype())); + metadata.put(METADATA.LANGUAGE, urlEntry.language()); + + final URIMetadataRow.Components meta = urlEntry.metadata(); + if (meta != null) { + metadata.put(METADATA.TITLE, meta.dc_title()); + metadata.put(METADATA.CREATOR, meta.dc_creator()); + metadata.put(METADATA.KEYWORDS, meta.dc_subject()); + metadata.put(METADATA.PUBLISHER, meta.dc_publisher()); + } + } + return metadata; + } + + public static EnumMap loadMetadata(final String url, final LoaderDispatcher loader) throws MalformedURLException { + final EnumMap metadata = new EnumMap(METADATA.class); + metadata.put(METADATA.IN_URLDB, "false"); + final DigestURI u = new DigestURI(url); + Response response = null; + try { + response = loader.load(loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE); + final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); + if(document != null) { + metadata.put(METADATA.TITLE, document.dc_title()); + metadata.put(METADATA.CREATOR, document.dc_creator()); + metadata.put(METADATA.KEYWORDS, document.dc_subject(',')); + metadata.put(METADATA.PUBLISHER, document.dc_publisher()); + metadata.put(METADATA.DESCRIPTION, document.dc_description()); + metadata.put(METADATA.MIMETYPE, document.dc_format()); + metadata.put(METADATA.LANGUAGE, document.dc_language()); + metadata.put(METADATA.CHARSET, document.getCharset()); + metadata.put(METADATA.SIZE, String.valueOf(document.getTextLength())); + } + } catch (IOException e) { + Log.logException(e); + } catch (Failure e) { + Log.logException(e); + } + return metadata; + } + + public static Map getWordFrequencies(final String url, final LoaderDispatcher loader) throws MalformedURLException { + final Map words = new HashMap(); + final DigestURI u = new DigestURI(url); + Response response = null; + int wordcount = 0; + String sentence, token; + try { + response = loader.load(loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE); + final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); + if(document != null) { + final Collection sentences = document.getSentences(false); + if (sentences != null) { + for (StringBuilder s: sentences) { + sentence = s.toString(); + Enumeration tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib); + while (tokens.hasMoreElements()) { + token = tokens.nextElement(); + if (token.length() > 2) { + wordcount++; + if(words.containsKey(token)) { + int count = words.get(token); + count++; + words.put(token, count); + } else { + words.put(token, 1); + } + } + } + } + } + document.close(); + } + } catch (IOException e) { + Log.logException(e); + } catch (Failure e) { + Log.logException(e); + } + return words; + } }