intermediate step for a YMark auto-tagging function based on word frequencies.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7325 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
apfelmaennchen 15 years ago
parent 403ee9c014
commit 54e63b556e

@ -3,12 +3,14 @@ import java.net.MalformedURLException;
import java.util.Date; import java.util.Date;
import java.util.EnumMap; import java.util.EnumMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.List;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.DateFormatter;
import de.anomic.data.YMarkKeyValueEntry;
import de.anomic.data.YMarkTables; import de.anomic.data.YMarkTables;
import de.anomic.data.userDB; import de.anomic.data.userDB;
import de.anomic.data.YMarkTables.METADATA; import de.anomic.data.YMarkTables.METADATA;
@ -176,16 +178,13 @@ public class get_treeview {
} }
} else if (isWordCount) { } else if (isWordCount) {
try { try {
final Map<String, Integer> words = YMarkTables.getWordFrequencies(post.get(ROOT).substring(2), sb.loader); final List<YMarkKeyValueEntry<String, Integer>> list = YMarkTables.getWordFrequencies(post.get(ROOT).substring(2), sb.loader, 10);
final Iterator<String> iter = words.keySet().iterator(); final Iterator<YMarkKeyValueEntry<String, Integer>> iter = list.iterator();
while (iter.hasNext()) { while (iter.hasNext()) {
String key = iter.next(); YMarkKeyValueEntry<String, Integer> e = iter.next();
int value = words.get(key); prop.put("folders_"+count+"_foldername","<small><b>"+e.getKey()+":</b> [" + e.getValue() + "]</small>");
if(value > 5 && value < 15) { putProp(count, "meta");
prop.put("folders_"+count+"_foldername","<small><b>"+key+":</b> [" + value + "]</small>"); count++;
putProp(count, "meta");
count++;
}
} }
count--; count--;
prop.put("folders_"+count+"_comma", ""); prop.put("folders_"+count+"_comma", "");

@ -0,0 +1,66 @@
package de.anomic.data;
/**
* @author apfelmaennchen
*
* @param <K>
* @param <V>
*/
public class YMarkKeyValueEntry<K extends Comparable<K>,V extends Comparable<V>> extends Object implements Comparable<YMarkKeyValueEntry<K,V>> {
private K key;
private V value;
public YMarkKeyValueEntry() {
this.key = null;
this.value = null;
}
public YMarkKeyValueEntry(K key, V value) {
this.key = key;
this.value = value;
}
/**
* The natural order of objects in this class is determind by their value components<br/>
* <b>Note:</b> this class has a natural ordering that is inconsistent with equals.
* @see java.lang.Comparable#compareTo(java.lang.Object)
*/
public int compareTo(YMarkKeyValueEntry<K,V> e) {
return this.value.compareTo(e.value);
}
/**
* Two objects of this class are considered to be equal, if their keys are equal.<br/>
* <b>Note:</b> this class has a natural ordering that is inconsistent with equals.
*/
@SuppressWarnings("unchecked")
@Override
public boolean equals(Object obj) {
if(this.getClass() == obj.getClass())
return this.key.equals(((YMarkKeyValueEntry<K,V>)obj).getKey());
else return false;
}
public K getKey() {
return this.key;
}
public V getValue() {
return this.value;
}
public void setValue(V value) {
this.value = value;
}
public void setKey(K key) {
this.key = key;
}
public void set(K key, V value) {
this.key = key;
this.value = value;
}
}

@ -4,7 +4,9 @@ import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.text.ParseException; import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Collections;
import java.util.Date; import java.util.Date;
import java.util.EnumMap; import java.util.EnumMap;
import java.util.EnumSet; import java.util.EnumSet;
@ -12,6 +14,7 @@ import java.util.Enumeration;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List;
import java.util.Map; import java.util.Map;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
@ -452,7 +455,7 @@ public class YMarkTables {
metadata.put(METADATA.MIMETYPE, document.dc_format()); metadata.put(METADATA.MIMETYPE, document.dc_format());
metadata.put(METADATA.LANGUAGE, document.dc_language()); metadata.put(METADATA.LANGUAGE, document.dc_language());
metadata.put(METADATA.CHARSET, document.getCharset()); metadata.put(METADATA.CHARSET, document.getCharset());
metadata.put(METADATA.SIZE, String.valueOf(document.getTextLength())); // metadata.put(METADATA.SIZE, String.valueOf(document.getTextLength()));
} }
} catch (IOException e) { } catch (IOException e) {
Log.logException(e); Log.logException(e);
@ -462,12 +465,13 @@ public class YMarkTables {
return metadata; return metadata;
} }
public static Map<String, Integer> getWordFrequencies(final String url, final LoaderDispatcher loader) throws MalformedURLException { public static List<YMarkKeyValueEntry<String, Integer>> getWordFrequencies(final String url, final LoaderDispatcher loader, final int top) throws MalformedURLException {
final Map<String,Integer> words = new HashMap<String,Integer>(); final List<YMarkKeyValueEntry<String, Integer>> list = new ArrayList<YMarkKeyValueEntry<String, Integer>>();
final DigestURI u = new DigestURI(url); final DigestURI u = new DigestURI(url);
Response response = null; Response response = null;
int wordcount = 0; int wordcount = 0;
String sentence, token; String sentence, token;
final YMarkKeyValueEntry<String, Integer> entry = new YMarkKeyValueEntry<String, Integer>();
try { try {
response = loader.load(loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE); response = loader.load(loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
@ -481,12 +485,12 @@ public class YMarkTables {
token = tokens.nextElement(); token = tokens.nextElement();
if (token.length() > 2) { if (token.length() > 2) {
wordcount++; wordcount++;
if(words.containsKey(token)) { entry.set(token.toLowerCase(), 1);
int count = words.get(token); if(list.contains(entry)) {
count++; int v = list.get(list.indexOf(entry)).getValue() + 1;
words.put(token, count); list.get(list.indexOf(entry)).setValue(v);
} else { } else {
words.put(token, 1); list.add(new YMarkKeyValueEntry<String, Integer>(token.toLowerCase(), 1));
} }
} }
} }
@ -499,6 +503,14 @@ public class YMarkTables {
} catch (Failure e) { } catch (Failure e) {
Log.logException(e); Log.logException(e);
} }
return words; Collections.sort(list);
float c = list.size();
Log.logInfo(YMarkTables.BOOKMARKS_LOG, "size: "+c);
int end = (int) (c*0.9);
int start = end - top;
if (start < 0)
start = 0;
Log.logInfo(YMarkTables.BOOKMARKS_LOG, "start: "+start+" end: "+end);
return list.subList(start,end);
} }
} }

Loading…
Cancel
Save