another step towards an auto tagging function for YMarks

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7337 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
apfelmaennchen 14 years ago
parent dad5818b40
commit 442497868d

@ -1,18 +1,19 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.EnumMap;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.data.YMarkKeyValueEntry;
import de.anomic.data.YMarkTables;
import de.anomic.data.UserDB;
import de.anomic.data.YMarkTables;
import de.anomic.data.YMarkTables.METADATA;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
@ -163,7 +164,7 @@ public class get_treeview {
prop.put("folders_"+count+"_hash", "m:"+url);
prop.put("folders_"+count+"_hasChildren", "true");
count++;
prop.put("folders_"+count+"_foldername","<small><b>WordCount</b></small>");
prop.put("folders_"+count+"_foldername","<small><b>WordCounts</b></small>");
putProp(count, "meta");
prop.put("folders_"+count+"_hash", "w:"+url);
prop.put("folders_"+count+"_hasChildren", "true");
@ -178,11 +179,12 @@ public class get_treeview {
}
} else if (isWordCount) {
try {
final List<YMarkKeyValueEntry<String, Integer>> list = YMarkTables.getWordFrequencies(post.get(ROOT).substring(2), sb.loader, 10);
final Iterator<YMarkKeyValueEntry<String, Integer>> iter = list.iterator();
while (iter.hasNext()) {
YMarkKeyValueEntry<String, Integer> e = iter.next();
prop.put("folders_"+count+"_foldername","<small><b>"+e.getKey()+":</b> [" + e.getValue() + "]</small>");
final TreeMap<String,Word> words = YMarkTables.getWordCounts(post.get(ROOT).substring(2), sb.loader);
final ArrayList<String> topwords = new ArrayList<String>(words.descendingKeySet());
for(int i = 0; i < 20 && i < topwords.size(); i++) {
String word = topwords.get(i);
int occur = words.get(word).occurrences();
prop.put("folders_"+count+"_foldername","<small><b>"+word+":</b> [" + occur + "]</small>");
putProp(count, "meta");
count++;
}
@ -208,8 +210,8 @@ public class get_treeview {
putProp(count, "meta");
count++;
}
count--;
prop.put("folders_"+count+"_comma", "");
prop.put("folders_"+count+"_foldername","<small><b>autotag:</b> " + sb.tables.bookmarks.autoTag(post.get(ROOT).substring(2), sb.loader, bmk_user, 5) + "</small>");
putProp(count, "meta");
count++;
prop.put("folders", count);
} catch (MalformedURLException e) {

@ -1,66 +0,0 @@
package de.anomic.data;
/**
* @author apfelmaennchen
*
* @param <K>
* @param <V>
*/
public class YMarkKeyValueEntry<K extends Comparable<K>,V extends Comparable<V>> extends Object implements Comparable<YMarkKeyValueEntry<K,V>> {
private K key;
private V value;
public YMarkKeyValueEntry() {
this.key = null;
this.value = null;
}
public YMarkKeyValueEntry(K key, V value) {
this.key = key;
this.value = value;
}
/**
* The natural order of objects in this class is determind by their value components<br/>
* <b>Note:</b> this class has a natural ordering that is inconsistent with equals.
* @see java.lang.Comparable#compareTo(java.lang.Object)
*/
public int compareTo(YMarkKeyValueEntry<K,V> e) {
return this.value.compareTo(e.value);
}
/**
* Two objects of this class are considered to be equal, if their keys are equal.<br/>
* <b>Note:</b> this class has a natural ordering that is inconsistent with equals.
*/
@SuppressWarnings("unchecked")
@Override
public boolean equals(Object obj) {
if(this.getClass() == obj.getClass())
return this.key.equals(((YMarkKeyValueEntry<K,V>)obj).getKey());
else return false;
}
public K getKey() {
return this.key;
}
public V getValue() {
return this.value;
}
public void setValue(V value) {
this.value = value;
}
public void setKey(K key) {
this.key = key;
}
public void set(K key, V value) {
this.key = key;
this.value = value;
}
}

@ -1,12 +1,11 @@
package de.anomic.data;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.EnumMap;
import java.util.EnumSet;
@ -14,8 +13,9 @@ import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
@ -31,6 +31,7 @@ import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.search.Segment;
import de.anomic.data.YMarkWordCountComparator;
public class YMarkTables {
@ -449,7 +450,7 @@ public class YMarkTables {
if(document != null) {
metadata.put(METADATA.TITLE, document.dc_title());
metadata.put(METADATA.CREATOR, document.dc_creator());
metadata.put(METADATA.KEYWORDS, document.dc_subject(','));
metadata.put(METADATA.KEYWORDS, document.dc_subject(' '));
metadata.put(METADATA.PUBLISHER, document.dc_publisher());
metadata.put(METADATA.DESCRIPTION, document.dc_description());
metadata.put(METADATA.MIMETYPE, document.dc_format());
@ -465,52 +466,88 @@ public class YMarkTables {
return metadata;
}
public static List<YMarkKeyValueEntry<String, Integer>> getWordFrequencies(final String url, final LoaderDispatcher loader, final int top) throws MalformedURLException {
final List<YMarkKeyValueEntry<String, Integer>> list = new ArrayList<YMarkKeyValueEntry<String, Integer>>();
/**
 * Suggests tags for the document found at the given URL.
 * <p>
 * The document is loaded through the given loader (cache strategy IFEXIST) and
 * condensed into word counts. Every token of the document's title, description
 * and subject that is also a condensed word gets its count boosted: by a factor
 * of 1000 if the token is already a tag of the given user, by a factor of 100 if
 * it is longer than three characters, and by one otherwise. Of the
 * <code>count</code> highest ranked words, those with more than 100 occurrences
 * after boosting are returned.
 * <p>
 * Load and parse failures are logged and lead to an empty result.
 *
 * @param url the URL of the document to analyse
 * @param loader the loader used to fetch the document
 * @param bmk_user the bookmark user whose tag table is consulted
 * @param count the maximum number of top ranked words to consider
 * @return the suggested tags separated by commas, or an empty string if there are none
 * @throws MalformedURLException if <code>url</code> cannot be turned into a DigestURI
 */
public String autoTag(final String url, final LoaderDispatcher loader, final String bmk_user, final int count) throws MalformedURLException {
    // holds the result only, so a failure half way through can never leak raw page text to the caller
    final StringBuilder tags = new StringBuilder();
    final DigestURI u = new DigestURI(url);
    try {
        final Response response = loader.load(loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
        final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
        if (document != null) {
            try {
                final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words();

                // the fields are separated by blanks; otherwise the last word of one field
                // and the first word of the next would be fused into a single token
                final StringBuilder text = new StringBuilder();
                text.append(document.dc_title()).append(' ');
                text.append(document.dc_description()).append(' ');
                text.append(document.dc_subject(' '));

                final Enumeration<String> tokens = Condenser.wordTokenizer(text.toString(), "UTF-8", LibraryProvider.dymLib);
                while (tokens.hasMoreElements()) {
                    final String token = tokens.nextElement();
                    final Word word = words.get(token);
                    if (word == null) {
                        continue;
                    }
                    int max = 1;
                    if (this.worktables.has(TABLES.TAGS.tablename(bmk_user), getKeyId(token))) {
                        // words the user already uses as tags are preferred most
                        max = word.occurrences() * 1000;
                    } else if (token.length() > 3) {
                        max = word.occurrences() * 100;
                    }
                    for (int i = 0; i < max; i++) {
                        word.inc();
                    }
                }

                final ArrayList<String> topwords = new ArrayList<String>(sortWordCounts(words).descendingKeySet());
                for (int i = 0; i < count && i < topwords.size(); i++) {
                    final String topword = topwords.get(i);
                    // a count above 100 can only be reached by words that were boosted above
                    if (words.get(topword).occurrences() > 100) {
                        if (tags.length() > 0) {
                            tags.append(',');
                        }
                        tags.append(topword);
                    }
                }
            } catch (UnsupportedEncodingException e) {
                Log.logException(e);
            } catch (IOException e) {
                Log.logException(e);
            }
        }
    } catch (IOException e) {
        Log.logException(e);
    } catch (Failure e) {
        Log.logException(e);
    }
    return tags.toString();
}
public static TreeMap<String,Word> getWordCounts(final String url, final LoaderDispatcher loader) throws MalformedURLException {
final DigestURI u = new DigestURI(url);
Response response = null;
int wordcount = 0;
String sentence, token;
final YMarkKeyValueEntry<String, Integer> entry = new YMarkKeyValueEntry<String, Integer>();
try {
response = loader.load(loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
if(document != null) {
final Collection<StringBuilder> sentences = document.getSentences(false);
if (sentences != null) {
for (StringBuilder s: sentences) {
sentence = s.toString();
Enumeration<String> tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib);
while (tokens.hasMoreElements()) {
token = tokens.nextElement();
if (token.length() > 2) {
wordcount++;
entry.set(token.toLowerCase(), 1);
if(list.contains(entry)) {
int v = list.get(list.indexOf(entry)).getValue() + 1;
list.get(list.indexOf(entry)).setValue(v);
} else {
list.add(new YMarkKeyValueEntry<String, Integer>(token.toLowerCase(), 1));
}
}
}
}
}
document.close();
return sortWordCounts(new Condenser(document, true, true, LibraryProvider.dymLib).words());
}
} catch (IOException e) {
Log.logException(e);
} catch (Failure e) {
Log.logException(e);
}
Collections.sort(list);
float c = list.size();
Log.logInfo(YMarkTables.BOOKMARKS_LOG, "size: "+c);
int end = (int) (c*0.9);
int start = end - top;
if (start < 0)
start = 0;
Log.logInfo(YMarkTables.BOOKMARKS_LOG, "start: "+start+" end: "+end);
return list.subList(start,end);
return new TreeMap<String, Word>();
}
/**
 * Copies the given words into a new TreeMap whose key order is defined by a
 * {@link YMarkWordCountComparator} created from the same map.
 *
 * @param unsorted_words the words to sort, keyed by the word itself
 * @return a new TreeMap filled with the entries of <code>unsorted_words</code>
 */
public static TreeMap<String,Word> sortWordCounts(final Map<String, Word> unsorted_words) {
    final YMarkWordCountComparator byWordCount = new YMarkWordCountComparator(unsorted_words);
    final TreeMap<String, Word> result = new TreeMap<String, Word>(byWordCount);
    result.putAll(unsorted_words);
    return result;
}
}

@ -0,0 +1,27 @@
package de.anomic.data;
import java.util.Comparator;
import java.util.Map;
import net.yacy.kelondro.data.word.Word;
/**
 * Orders words (map keys) by the occurrence count of the {@link Word} they are
 * mapped to, lowest count first. Words with the same count are ordered by the
 * natural order of the key.
 * <p>
 * The tie-break on the key matters when this comparator backs a sorted map or
 * set: such collections treat keys that compare as 0 as one and the same key, so
 * without it only one word per occurrence count would survive.
 */
public class YMarkWordCountComparator implements Comparator<String> {

    private final Map<String,Word> words;

    /**
     * @param words the map used to look up the occurrence count of each compared key
     */
    public YMarkWordCountComparator(final Map<String,Word> words) {
        this.words = words;
    }

    /**
     * Compares two keys by occurrence count, then by the keys themselves.
     *
     * @param k1 the first key
     * @param k2 the second key
     * @return a negative value, zero or a positive value as <code>k1</code> sorts
     *         before, equal to or after <code>k2</code>; zero only for equal keys
     */
    @Override
    public int compare(final String k1, final String k2) {
        final int c1 = occurrencesOf(k1);
        final int c2 = occurrencesOf(k2);
        if (c1 > c2) {
            return 1;
        }
        if (c1 < c2) {
            return -1;
        }
        return k1.compareTo(k2);
    }

    /**
     * Looks up the occurrence count of a key; a key without a mapped word counts
     * as 0 so that lookups of unknown keys do not fail.
     */
    private int occurrencesOf(final String key) {
        final Word word = this.words.get(key);
        return (word == null) ? 0 : word.occurrences();
    }
}
Loading…
Cancel
Save