added a drill-down for metadata and word count to /api/ymarks/test_treeview.html

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7324 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
apfelmaennchen 14 years ago
parent a025b1da89
commit 403ee9c014

@ -216,14 +216,12 @@ public class Table_YMark_p {
count = 0;
try {
Iterator<Tables.Row> mapIterator;
if(post.containsKey("folders") && !post.get("folders").isEmpty()) {
mapIterator = sb.tables.bookmarks.folders.getBookmarks(bmk_user, post.get("folders"));
mapIterator = sb.tables.orderByPK(sb.tables.bookmarks.folders.getBookmarks(bmk_user, post.get("folders")), maxcount).iterator();
} else if(post.containsKey("tags") && !post.get("tags").isEmpty()) {
mapIterator = sb.tables.bookmarks.tags.getBookmarks(bmk_user, post.get("tags"));
mapIterator = sb.tables.orderByPK(sb.tables.bookmarks.tags.getBookmarks(bmk_user, post.get("tags")), maxcount).iterator();
} else {
final Iterator<Tables.Row> plainIterator = sb.tables.iterator(table, matcher);
mapIterator = sb.tables.orderByPK(plainIterator, maxcount).iterator();
mapIterator = sb.tables.orderByPK(sb.tables.iterator(table, matcher), maxcount).iterator();
}
Tables.Row row;

@ -1,7 +1,9 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.EnumMap;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.index.RowSpaceExceededException;
@ -9,6 +11,8 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.data.YMarkTables;
import de.anomic.data.userDB;
import de.anomic.data.YMarkTables.METADATA;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -26,14 +30,17 @@ public class get_treeview {
final userDB.Entry user = sb.userDB.getUser(header);
final boolean isAdmin = (sb.verifyAuthentication(header, true));
final boolean isAuthUser = user!= null && user.hasRight(userDB.Entry.BOOKMARK_RIGHT);
if(isAdmin || isAuthUser) {
final String bmk_user = (isAuthUser ? user.getUserName() : YMarkTables.USER_ADMIN);
String root = YMarkTables.FOLDERS_ROOT;
String[] foldername = null;
boolean isFolder = true;
boolean isBookmark = false;
boolean isMetadata = false;
boolean isWordCount = false;
if (post != null){
if (post.containsKey(ROOT)) {
@ -41,8 +48,14 @@ public class get_treeview {
root = "";
} else if (post.get(ROOT).startsWith(YMarkTables.FOLDERS_ROOT)) {
root = post.get(ROOT);
} else {
// root = YMarkTables.FOLDERS_ROOT + post.get(ROOT);
} else if (post.get(ROOT).startsWith("b:")) {
isBookmark = true;
isFolder = false;
} else if (post.get(ROOT).startsWith("m:")) {
isMetadata = true;
isFolder = false;
} else if (post.get(ROOT).startsWith("w:")) {
isWordCount = true;
isFolder = false;
}
}
@ -99,7 +112,7 @@ public class get_treeview {
prop.put("folders_"+count+"_expanded", "false");
prop.put("folders_"+count+"_url", url);
prop.put("folders_"+count+"_type", "file");
prop.put("folders_"+count+"_hash", urlHash);
prop.put("folders_"+count+"_hash", "b:"+urlHash);
prop.put("folders_"+count+"_hasChildren", "true");
prop.put("folders_"+count+"_comma", ",");
count++;
@ -114,9 +127,11 @@ public class get_treeview {
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
} else {
} else if(isBookmark) {
try {
bmk_row = sb.tables.select(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user), post.get(ROOT).getBytes());
final String urlHash = post.get(ROOT).substring(2);
String url = "";
bmk_row = sb.tables.select(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user), urlHash.getBytes());
if(bmk_row != null) {
it = bmk_row.keySet().iterator();
while(it.hasNext()) {
@ -131,6 +146,8 @@ public class get_treeview {
}
} else {
final String value = new String(bmk_row.get(key));
if (key.equals("url"))
url = value;
prop.put("folders_"+count+"_foldername","<small><b>"+key+":</b> " + value + "</small>");
if(YMarkTables.BOOKMARK.contains(key))
putProp(count, YMarkTables.BOOKMARK.get(key).type());
@ -139,9 +156,17 @@ public class get_treeview {
count++;
}
}
count--;
prop.put("folders_"+count+"_comma", "");
count++;
prop.put("folders_"+count+"_foldername","<small><b>MetaData</b></small>");
putProp(count, "meta");
prop.put("folders_"+count+"_hash", "m:"+url);
prop.put("folders_"+count+"_hasChildren", "true");
count++;
prop.put("folders_"+count+"_foldername","<small><b>WordCount</b></small>");
putProp(count, "meta");
prop.put("folders_"+count+"_hash", "w:"+url);
prop.put("folders_"+count+"_hasChildren", "true");
prop.put("folders_"+count+"_comma", "");
count++;
prop.put("folders", count);
}
} catch (IOException e) {
@ -149,6 +174,48 @@ public class get_treeview {
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
} else if (isWordCount) {
try {
final Map<String, Integer> words = YMarkTables.getWordFrequencies(post.get(ROOT).substring(2), sb.loader);
final Iterator<String> iter = words.keySet().iterator();
while (iter.hasNext()) {
String key = iter.next();
int value = words.get(key);
if(value > 5 && value < 15) {
prop.put("folders_"+count+"_foldername","<small><b>"+key+":</b> [" + value + "]</small>");
putProp(count, "meta");
count++;
}
}
count--;
prop.put("folders_"+count+"_comma", "");
count++;
prop.put("folders", count);
} catch (MalformedURLException e) {
Log.logException(e);
}
} else if (isMetadata) {
try {
final String url = post.get(ROOT).substring(2);
EnumMap<METADATA, String> metadata;
metadata = YMarkTables.getMetadata(YMarkTables.getBookmarkId(url), sb.indexSegments.segment(Segments.Process.PUBLIC));
if (metadata.isEmpty())
metadata = YMarkTables.loadMetadata(url, sb.loader);
final Iterator<METADATA> iter = metadata.keySet().iterator();
while (iter.hasNext()) {
final METADATA key = iter.next();
final String value = metadata.get(key);
prop.put("folders_"+count+"_foldername","<small><b>"+key.toString().toLowerCase()+":</b> " + value + "</small>");
putProp(count, "meta");
count++;
}
count--;
prop.put("folders_"+count+"_comma", "");
count++;
prop.put("folders", count);
} catch (MalformedURLException e) {
Log.logException(e);
}
}
} else {
prop.put(YMarkTables.USER_AUTHENTICATE,YMarkTables.USER_AUTHENTICATE_MSG);

@ -4,19 +4,30 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.blob.Tables.Data;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.search.Segment;
public class YMarkTables {
@ -122,6 +133,25 @@ public class YMarkTables {
}
}
/**
 * Keys under which metadata about a bookmarked URL is reported.
 * The values are filled by {@code getMetadata} (from the URL database) and
 * {@code loadMetadata} (from a freshly loaded document); not every key is
 * set by both. The declaration order is the iteration order of an
 * {@code EnumMap} keyed by this type, so it must not be rearranged casually.
 */
public enum METADATA {
    /** Title of the document (dc_title). */
    TITLE,
    /** Description of the document (dc_description); only set when the document is loaded. */
    DESCRIPTION,
    /** Favicon of the document; NOTE(review): not filled by the metadata methods in this class. */
    FAVICON,
    /** Keywords of the document (dc_subject). */
    KEYWORDS,
    /** Language of the document. */
    LANGUAGE,
    /** Creator of the document (dc_creator). */
    CREATOR,
    /** Publisher of the document (dc_publisher). */
    PUBLISHER,
    /** Character set of the document; only set when the document is loaded. */
    CHARSET,
    /** Document type: the URL database doctype, or dc_format of a loaded document. */
    MIMETYPE,
    /** Size: the URL database entry size, or the text length of a loaded document. */
    SIZE,
    /** Word count as recorded in the URL database entry. */
    WORDCOUNT,
    /** "true" when the values come from the URL database, "false" when the document was loaded. */
    IN_URLDB,
    /** Fresh date of the URL database entry, ISO 8601 formatted. */
    FRESHDATE,
    /** Load date of the URL database entry, ISO 8601 formatted. */
    LOADDATE,
    /** Modification date of the URL database entry, ISO 8601 formatted. */
    MODDATE,
    /** Snippet stored with the URL database entry. */
    SNIPPET;
}
public final static HashMap<String,String> POISON = new HashMap<String,String>();
public final static String TAGS_SEPARATOR = ",";
@ -379,4 +409,96 @@ public class YMarkTables {
}
}
}
/**
 * Collects the metadata that the URL database of an index segment holds for one URL hash.
 *
 * @param urlHash      hash identifying the URL to look up
 * @param indexSegment segment whose URL metadata store is queried
 * @return the collected values; the map is empty when the store has no entry for the hash
 * @throws MalformedURLException declared for callers; which call raises it is not visible in this method
 */
public static EnumMap<METADATA, String> getMetadata(final byte[] urlHash, final Segment indexSegment) throws MalformedURLException {
    final EnumMap<METADATA, String> result = new EnumMap<METADATA, String>(METADATA.class);

    final URIMetadataRow entry = indexSegment.urlMetadata().load(urlHash, null, 0);
    if (entry == null) {
        // unknown URL: callers test isEmpty() to detect this case
        return result;
    }

    // values read directly from the database row
    result.put(METADATA.IN_URLDB, "true");
    result.put(METADATA.SIZE, String.valueOf(entry.size()));
    result.put(METADATA.FRESHDATE, DateFormatter.formatISO8601(entry.freshdate()));
    result.put(METADATA.LOADDATE, DateFormatter.formatISO8601(entry.loaddate()));
    result.put(METADATA.MODDATE, DateFormatter.formatISO8601(entry.moddate()));
    result.put(METADATA.SNIPPET, String.valueOf(entry.snippet()));
    result.put(METADATA.WORDCOUNT, String.valueOf(entry.wordCount()));
    result.put(METADATA.MIMETYPE, String.valueOf(entry.doctype()));
    result.put(METADATA.LANGUAGE, entry.language());

    // descriptive fields live in the row's component record, which may be absent
    final URIMetadataRow.Components components = entry.metadata();
    if (components == null) {
        return result;
    }
    result.put(METADATA.TITLE, components.dc_title());
    result.put(METADATA.CREATOR, components.dc_creator());
    result.put(METADATA.KEYWORDS, components.dc_subject());
    result.put(METADATA.PUBLISHER, components.dc_publisher());
    return result;
}
/**
 * Loads the document behind a URL through the loader and extracts its metadata.
 * Used when the URL database holds no entry for the URL, hence IN_URLDB is
 * always reported as "false".
 *
 * Load and parse failures are logged and not propagated; the map then only
 * contains what was collected before the failure (at least IN_URLDB).
 *
 * @param url    the URL to load
 * @param loader dispatcher used to fetch the document (cache strategy IFEXIST)
 * @return the collected metadata, never {@code null}
 * @throws MalformedURLException if {@code url} cannot be turned into a DigestURI
 */
public static EnumMap<METADATA, String> loadMetadata(final String url, final LoaderDispatcher loader) throws MalformedURLException {
    final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
    metadata.put(METADATA.IN_URLDB, "false");
    final DigestURI u = new DigestURI(url);
    try {
        final Response response = loader.load(loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
        if (response == null) {
            // NOTE(review): defensive guard; whether load() can return null is not visible here
            return metadata;
        }
        final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
        if (document != null) {
            try {
                metadata.put(METADATA.TITLE, document.dc_title());
                metadata.put(METADATA.CREATOR, document.dc_creator());
                metadata.put(METADATA.KEYWORDS, document.dc_subject(','));
                metadata.put(METADATA.PUBLISHER, document.dc_publisher());
                metadata.put(METADATA.DESCRIPTION, document.dc_description());
                metadata.put(METADATA.MIMETYPE, document.dc_format());
                metadata.put(METADATA.LANGUAGE, document.dc_language());
                metadata.put(METADATA.CHARSET, document.getCharset());
                metadata.put(METADATA.SIZE, String.valueOf(document.getTextLength()));
            } finally {
                // release the parsed document, as getWordFrequencies does
                document.close();
            }
        }
    } catch (IOException e) {
        Log.logException(e);
    } catch (Failure e) {
        Log.logException(e);
    }
    return metadata;
}
/**
 * Loads the document behind a URL and counts how often each word occurs in its sentences.
 * Only tokens longer than two characters are counted.
 *
 * Load and parse failures are logged and not propagated; the map then holds
 * whatever was counted before the failure (possibly nothing).
 *
 * @param url    the URL to load
 * @param loader dispatcher used to fetch the document (cache strategy IFEXIST)
 * @return word to occurrence count, never {@code null}
 * @throws MalformedURLException if {@code url} cannot be turned into a DigestURI
 */
public static Map<String, Integer> getWordFrequencies(final String url, final LoaderDispatcher loader) throws MalformedURLException {
    final Map<String, Integer> words = new HashMap<String, Integer>();
    final DigestURI u = new DigestURI(url);
    try {
        final Response response = loader.load(loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
        if (response == null) {
            // NOTE(review): defensive guard; whether load() can return null is not visible here
            return words;
        }
        final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
        if (document != null) {
            try {
                final Collection<StringBuilder> sentences = document.getSentences(false);
                if (sentences != null) {
                    for (final StringBuilder s : sentences) {
                        final Enumeration<String> tokens = Condenser.wordTokenizer(s.toString(), "UTF-8", LibraryProvider.dymLib);
                        while (tokens.hasMoreElements()) {
                            final String token = tokens.nextElement();
                            if (token.length() > 2) {
                                // single lookup: null means first occurrence
                                final Integer seen = words.get(token);
                                words.put(token, seen == null ? 1 : seen + 1);
                            }
                        }
                    }
                }
            } finally {
                // close even when tokenizing fails, so the document is not leaked
                document.close();
            }
        }
    } catch (IOException e) {
        Log.logException(e);
    } catch (Failure e) {
        Log.logException(e);
    }
    return words;
}
}

Loading…
Cancel
Save