diff --git a/htroot/api/ymarks/get_treeview.java b/htroot/api/ymarks/get_treeview.java index 33411844a..510f8fcb1 100644 --- a/htroot/api/ymarks/get_treeview.java +++ b/htroot/api/ymarks/get_treeview.java @@ -1,18 +1,19 @@ import java.io.IOException; import java.net.MalformedURLException; +import java.util.ArrayList; import java.util.Date; import java.util.EnumMap; import java.util.Iterator; -import java.util.List; +import java.util.TreeMap; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.blob.Tables; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; -import de.anomic.data.YMarkKeyValueEntry; -import de.anomic.data.YMarkTables; import de.anomic.data.UserDB; +import de.anomic.data.YMarkTables; import de.anomic.data.YMarkTables.METADATA; import de.anomic.search.Segments; import de.anomic.search.Switchboard; @@ -163,7 +164,7 @@ public class get_treeview { prop.put("folders_"+count+"_hash", "m:"+url); prop.put("folders_"+count+"_hasChildren", "true"); count++; - prop.put("folders_"+count+"_foldername","WordCount"); + prop.put("folders_"+count+"_foldername","WordCounts"); putProp(count, "meta"); prop.put("folders_"+count+"_hash", "w:"+url); prop.put("folders_"+count+"_hasChildren", "true"); @@ -178,11 +179,12 @@ public class get_treeview { } } else if (isWordCount) { try { - final List> list = YMarkTables.getWordFrequencies(post.get(ROOT).substring(2), sb.loader, 10); - final Iterator> iter = list.iterator(); - while (iter.hasNext()) { - YMarkKeyValueEntry e = iter.next(); - prop.put("folders_"+count+"_foldername",""+e.getKey()+": [" + e.getValue() + "]"); + final TreeMap words = YMarkTables.getWordCounts(post.get(ROOT).substring(2), sb.loader); + final ArrayList topwords = new ArrayList(words.descendingKeySet()); + for(int i = 0; i < 20 && i < topwords.size(); i++) { + String word = topwords.get(i); + int occur = words.get(word).occurrences(); + prop.put("folders_"+count+"_foldername",""+word+": [" + occur + "]"); putProp(count, "meta"); count++; } @@ -208,8 +210,8 @@ public class get_treeview { putProp(count, "meta"); count++; } - count--; - prop.put("folders_"+count+"_comma", ""); + prop.put("folders_"+count+"_foldername","autotag: " + sb.tables.bookmarks.autoTag(post.get(ROOT).substring(2), sb.loader, bmk_user, 5) + ""); + putProp(count, "meta"); count++; prop.put("folders", count); } catch (MalformedURLException e) { diff --git a/source/de/anomic/data/YMarkKeyValueEntry.java b/source/de/anomic/data/YMarkKeyValueEntry.java deleted file mode 100644 index 7c70caa4d..000000000 --- a/source/de/anomic/data/YMarkKeyValueEntry.java +++ /dev/null @@ -1,66 +0,0 @@ -package de.anomic.data; - -/** - * @author apfelmaennchen - * - * @param - * @param - */ -public class YMarkKeyValueEntry,V extends Comparable> extends Object implements Comparable> { - - private K key; - private V value; - - - public YMarkKeyValueEntry() { - this.key = null; - this.value = null; - } - - public YMarkKeyValueEntry(K key, V value) { - this.key = key; - this.value = value; - } - - /** - * The natural order of objects in this class is determind by their value components
- * Note: this class has a natural ordering that is inconsistent with equals. - * @see java.lang.Comparable#compareTo(java.lang.Object) - */ - public int compareTo(YMarkKeyValueEntry e) { - return this.value.compareTo(e.value); - } - - /** - * Two objects of this class are considered to be equal, if their keys are equal.
- * Note: this class has a natural ordering that is inconsistent with equals. - */ - @SuppressWarnings("unchecked") - @Override - public boolean equals(Object obj) { - if(this.getClass() == obj.getClass()) - return this.key.equals(((YMarkKeyValueEntry)obj).getKey()); - else return false; - } - - public K getKey() { - return this.key; - } - - public V getValue() { - return this.value; - } - - public void setValue(V value) { - this.value = value; - } - - public void setKey(K key) { - this.key = key; - } - - public void set(K key, V value) { - this.key = key; - this.value = value; - } -} diff --git a/source/de/anomic/data/YMarkTables.java b/source/de/anomic/data/YMarkTables.java index 9057f3baa..fab6fcf56 100644 --- a/source/de/anomic/data/YMarkTables.java +++ b/source/de/anomic/data/YMarkTables.java @@ -1,12 +1,11 @@ package de.anomic.data; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; import java.util.Date; import java.util.EnumMap; import java.util.EnumSet; @@ -14,8 +13,9 @@ import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; -import java.util.List; import java.util.Map; +import java.util.TreeMap; + import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.Parser.Failure; @@ -31,6 +31,7 @@ import net.yacy.repository.LoaderDispatcher; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; import de.anomic.search.Segment; +import de.anomic.data.YMarkWordCountComparator; public class YMarkTables { @@ -449,7 +450,7 @@ public class YMarkTables { if(document != null) { metadata.put(METADATA.TITLE, document.dc_title()); metadata.put(METADATA.CREATOR, document.dc_creator()); - metadata.put(METADATA.KEYWORDS, document.dc_subject(',')); + metadata.put(METADATA.KEYWORDS, document.dc_subject(' ')); metadata.put(METADATA.PUBLISHER, document.dc_publisher()); metadata.put(METADATA.DESCRIPTION, document.dc_description()); metadata.put(METADATA.MIMETYPE, document.dc_format()); @@ -465,52 +466,88 @@ public class YMarkTables { return metadata; } - public static List> getWordFrequencies(final String url, final LoaderDispatcher loader, final int top) throws MalformedURLException { - final List> list = new ArrayList>(); - final DigestURI u = new DigestURI(url); + public String autoTag(final String url, final LoaderDispatcher loader, final String bmk_user, final int count) throws MalformedURLException { + final StringBuilder buffer = new StringBuilder(); + final Map words; + final DigestURI u = new DigestURI(url); Response response = null; - int wordcount = 0; - String sentence, token; - final YMarkKeyValueEntry entry = new YMarkKeyValueEntry(); try { response = loader.load(loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE); final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); if(document != null) { - final Collection sentences = document.getSentences(false); - if (sentences != null) { - for (StringBuilder s: sentences) { - sentence = s.toString(); - Enumeration tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib); - while (tokens.hasMoreElements()) { - token = tokens.nextElement(); - if (token.length() > 2) { - wordcount++; - entry.set(token.toLowerCase(), 1); - if(list.contains(entry)) { - int v = list.get(list.indexOf(entry)).getValue() + 1; - list.get(list.indexOf(entry)).setValue(v); - } else { - list.add(new YMarkKeyValueEntry(token.toLowerCase(), 1)); - } - } - } - } - } - document.close(); + try { + words = new Condenser(document, true, true, LibraryProvider.dymLib).words(); + buffer.append(document.dc_title()); + buffer.append(document.dc_description()); + buffer.append(document.dc_subject(' ')); + final Enumeration tokens = Condenser.wordTokenizer(buffer.toString(), "UTF-8", LibraryProvider.dymLib); + while(tokens.hasMoreElements()) { + int max = 1; + String token = tokens.nextElement(); + Word word = words.get(token); + if (words.containsKey(token)) { + if (this.worktables.has(TABLES.TAGS.tablename(bmk_user), getKeyId(token))) { + max = word.occurrences() * 1000; + } else if (token.length()>3) { + max = word.occurrences() * 100; + } + for(int i=0; i topwords = new ArrayList(sortWordCounts(words).descendingKeySet()); + for(int i=0; i 100) { + buffer.append(topwords.get(i)); + /* + buffer.append('['); + buffer.append(words.get(topwords.get(i)).occurrences()); + buffer.append(']'); + */ + buffer.append(','); + } + } + if(buffer.length() > 0) { + buffer.deleteCharAt(buffer.length()-1); + } + + } catch (UnsupportedEncodingException e) { + Log.logException(e); + } catch (IOException e) { + Log.logException(e); + } + } + } catch (IOException e) { + Log.logException(e); + } catch (Failure e) { + Log.logException(e); + } + return buffer.toString(); + } + + public static TreeMap getWordCounts(final String url, final LoaderDispatcher loader) throws MalformedURLException { + final DigestURI u = new DigestURI(url); + Response response = null; + try { + response = loader.load(loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE); + final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); + if(document != null) { + return sortWordCounts(new Condenser(document, true, true, LibraryProvider.dymLib).words()); } - } catch (IOException e) { + } catch (IOException e) { Log.logException(e); } catch (Failure e) { Log.logException(e); } - Collections.sort(list); - float c = list.size(); - Log.logInfo(YMarkTables.BOOKMARKS_LOG, "size: "+c); - int end = (int) (c*0.9); - int start = end - top; - if (start < 0) - start = 0; - Log.logInfo(YMarkTables.BOOKMARKS_LOG, "start: "+start+" end: "+end); - return list.subList(start,end); + return new TreeMap(); } + + public static TreeMap sortWordCounts(final Map unsorted_words) { + final TreeMap sorted_words = new TreeMap(new YMarkWordCountComparator(unsorted_words)); + sorted_words.putAll(unsorted_words); + return sorted_words; + } + } diff --git a/source/de/anomic/data/YMarkWordCountComparator.java b/source/de/anomic/data/YMarkWordCountComparator.java new file mode 100644 index 000000000..6505bd846 --- /dev/null +++ b/source/de/anomic/data/YMarkWordCountComparator.java @@ -0,0 +1,27 @@ +package de.anomic.data; + +import java.util.Comparator; +import java.util.Map; + +import net.yacy.kelondro.data.word.Word; + +public class YMarkWordCountComparator implements Comparator { + + private Map words; + + public YMarkWordCountComparator(final Map words) { + this.words = words; + } + + public int compare(final String k1, final String k2) { + final Word w1 = this.words.get(k1); + final Word w2 = this.words.get(k2); + + if(w1.occurrences() > w2.occurrences()) + return 1; + else if(w1.occurrences() < w2.occurrences()) + return -1; + else + return 0; + } +}