intermediate step for a YMark auto-tagging function based on word frequencies.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7325 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
apfelmaennchen 15 years ago
parent 403ee9c014
commit 54e63b556e

@ -3,12 +3,14 @@ import java.net.MalformedURLException;
import java.util.Date;
import java.util.EnumMap;
import java.util.Iterator;
import java.util.Map;
import java.util.List;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.data.YMarkKeyValueEntry;
import de.anomic.data.YMarkTables;
import de.anomic.data.userDB;
import de.anomic.data.YMarkTables.METADATA;
@ -176,16 +178,13 @@ public class get_treeview {
}
} else if (isWordCount) {
try {
final Map<String, Integer> words = YMarkTables.getWordFrequencies(post.get(ROOT).substring(2), sb.loader);
final Iterator<String> iter = words.keySet().iterator();
final List<YMarkKeyValueEntry<String, Integer>> list = YMarkTables.getWordFrequencies(post.get(ROOT).substring(2), sb.loader, 10);
final Iterator<YMarkKeyValueEntry<String, Integer>> iter = list.iterator();
while (iter.hasNext()) {
String key = iter.next();
int value = words.get(key);
if(value > 5 && value < 15) {
prop.put("folders_"+count+"_foldername","<small><b>"+key+":</b> [" + value + "]</small>");
putProp(count, "meta");
count++;
}
YMarkKeyValueEntry<String, Integer> e = iter.next();
prop.put("folders_"+count+"_foldername","<small><b>"+e.getKey()+":</b> [" + e.getValue() + "]</small>");
putProp(count, "meta");
count++;
}
count--;
prop.put("folders_"+count+"_comma", "");

@ -0,0 +1,66 @@
package de.anomic.data;
/**
* @author apfelmaennchen
*
* @param <K>
* @param <V>
*/
public class YMarkKeyValueEntry<K extends Comparable<K>,V extends Comparable<V>> extends Object implements Comparable<YMarkKeyValueEntry<K,V>> {
private K key;
private V value;
public YMarkKeyValueEntry() {
this.key = null;
this.value = null;
}
public YMarkKeyValueEntry(K key, V value) {
this.key = key;
this.value = value;
}
/**
* The natural order of objects in this class is determind by their value components<br/>
* <b>Note:</b> this class has a natural ordering that is inconsistent with equals.
* @see java.lang.Comparable#compareTo(java.lang.Object)
*/
public int compareTo(YMarkKeyValueEntry<K,V> e) {
return this.value.compareTo(e.value);
}
/**
* Two objects of this class are considered to be equal, if their keys are equal.<br/>
* <b>Note:</b> this class has a natural ordering that is inconsistent with equals.
*/
@SuppressWarnings("unchecked")
@Override
public boolean equals(Object obj) {
if(this.getClass() == obj.getClass())
return this.key.equals(((YMarkKeyValueEntry<K,V>)obj).getKey());
else return false;
}
public K getKey() {
return this.key;
}
public V getValue() {
return this.value;
}
public void setValue(V value) {
this.value = value;
}
public void setKey(K key) {
this.key = key;
}
public void set(K key, V value) {
this.key = key;
this.value = value;
}
}

@ -4,7 +4,9 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.EnumMap;
import java.util.EnumSet;
@ -12,6 +14,7 @@ import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@ -452,7 +455,7 @@ public class YMarkTables {
metadata.put(METADATA.MIMETYPE, document.dc_format());
metadata.put(METADATA.LANGUAGE, document.dc_language());
metadata.put(METADATA.CHARSET, document.getCharset());
metadata.put(METADATA.SIZE, String.valueOf(document.getTextLength()));
// metadata.put(METADATA.SIZE, String.valueOf(document.getTextLength()));
}
} catch (IOException e) {
Log.logException(e);
@ -462,12 +465,13 @@ public class YMarkTables {
return metadata;
}
public static Map<String, Integer> getWordFrequencies(final String url, final LoaderDispatcher loader) throws MalformedURLException {
final Map<String,Integer> words = new HashMap<String,Integer>();
final DigestURI u = new DigestURI(url);
public static List<YMarkKeyValueEntry<String, Integer>> getWordFrequencies(final String url, final LoaderDispatcher loader, final int top) throws MalformedURLException {
final List<YMarkKeyValueEntry<String, Integer>> list = new ArrayList<YMarkKeyValueEntry<String, Integer>>();
final DigestURI u = new DigestURI(url);
Response response = null;
int wordcount = 0;
String sentence, token;
final YMarkKeyValueEntry<String, Integer> entry = new YMarkKeyValueEntry<String, Integer>();
try {
response = loader.load(loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
@ -481,12 +485,12 @@ public class YMarkTables {
token = tokens.nextElement();
if (token.length() > 2) {
wordcount++;
if(words.containsKey(token)) {
int count = words.get(token);
count++;
words.put(token, count);
entry.set(token.toLowerCase(), 1);
if(list.contains(entry)) {
int v = list.get(list.indexOf(entry)).getValue() + 1;
list.get(list.indexOf(entry)).setValue(v);
} else {
words.put(token, 1);
list.add(new YMarkKeyValueEntry<String, Integer>(token.toLowerCase(), 1));
}
}
}
@ -499,6 +503,14 @@ public class YMarkTables {
} catch (Failure e) {
Log.logException(e);
}
return words;
Collections.sort(list);
float c = list.size();
Log.logInfo(YMarkTables.BOOKMARKS_LOG, "size: "+c);
int end = (int) (c*0.9);
int start = end - top;
if (start < 0)
start = 0;
Log.logInfo(YMarkTables.BOOKMARKS_LOG, "start: "+start+" end: "+end);
return list.subList(start,end);
}
}

Loading…
Cancel
Save