|
|
|
@ -4,13 +4,13 @@ import java.io.ByteArrayInputStream;
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.net.MalformedURLException;
|
|
|
|
|
import java.util.Arrays;
|
|
|
|
|
import java.util.Enumeration;
|
|
|
|
|
import java.util.HashSet;
|
|
|
|
|
import java.util.Iterator;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
import java.util.TreeMap;
|
|
|
|
|
import java.util.TreeSet;
|
|
|
|
|
import java.util.concurrent.ArrayBlockingQueue;
|
|
|
|
|
|
|
|
|
|
import net.yacy.cora.document.UTF8;
|
|
|
|
|
import net.yacy.cora.services.federated.yacy.CacheStrategy;
|
|
|
|
|
import net.yacy.document.Condenser;
|
|
|
|
@ -28,11 +28,11 @@ import de.anomic.crawler.retrieval.Response;
|
|
|
|
|
public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandler {
|
|
|
|
|
|
|
|
|
|
private static final String EMPTY_STRING = new String();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public final static String SPACE = " ";
|
|
|
|
|
public final static String POISON = "";
|
|
|
|
|
public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo",
|
|
|
|
|
public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo",
|
|
|
|
|
"and", "with", "the", "gt", "lt"));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -81,10 +81,10 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static String autoTag(final Document document, final int max, final TreeMap<String, YMarkTag> tags) {
|
|
|
|
|
final TreeSet<YMarkTag> topwords = new TreeSet<YMarkTag>();
|
|
|
|
|
StringBuilder token;
|
|
|
|
|
StringBuilder token;
|
|
|
|
|
|
|
|
|
|
if(document == null) {
|
|
|
|
|
return EMPTY_STRING;
|
|
|
|
@ -92,7 +92,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
|
|
|
|
|
|
|
|
|
//get words from document
|
|
|
|
|
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// generate potential tags from document title, description and subject
|
|
|
|
|
final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
|
|
|
|
|
final StringBuilder buffer = new StringBuilder(bufferSize);
|
|
|
|
@ -103,21 +103,21 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
|
|
|
|
final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
|
|
|
|
|
try {
|
|
|
|
|
int score = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// get phrases
|
|
|
|
|
final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
|
|
|
|
|
phrases.putAll(getPhrases(document, 3));
|
|
|
|
|
final Iterator<String> iter = phrases.keySet().iterator();
|
|
|
|
|
while(iter.hasNext()) {
|
|
|
|
|
score = 10;
|
|
|
|
|
final String phrase = iter.next();
|
|
|
|
|
final String phrase = iter.next();
|
|
|
|
|
if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
|
|
|
|
|
score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
|
|
|
|
|
}
|
|
|
|
|
if(isDigitSpace(phrase)) {
|
|
|
|
|
score = 10;
|
|
|
|
|
}
|
|
|
|
|
if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {
|
|
|
|
|
if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {
|
|
|
|
|
score = score * 10;
|
|
|
|
|
}
|
|
|
|
|
if (tags.containsKey(phrase)) {
|
|
|
|
@ -127,14 +127,14 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
|
|
|
|
pwords.append(phrase);
|
|
|
|
|
pwords.append(' ');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// loop through potential tag and rank them
|
|
|
|
|
while(tokens.hasMoreElements()) {
|
|
|
|
|
while(tokens.hasMoreElements()) {
|
|
|
|
|
score = 0;
|
|
|
|
|
token = tokens.nextElement();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// check if the token appears in the text
|
|
|
|
|
if (words.containsKey(token.toString())) {
|
|
|
|
|
if (words.containsKey(token.toString())) {
|
|
|
|
|
final Word word = words.get(token.toString());
|
|
|
|
|
// token appears in text and matches an existing bookmark tag
|
|
|
|
|
if (tags.containsKey(token.toString())) {
|
|
|
|
@ -172,8 +172,8 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
|
|
|
|
} finally {
|
|
|
|
|
tokens.close();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
|
|
|
|
|
final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
|
|
|
|
|
final StringBuilder phrase = new StringBuilder(128);
|
|
|
|
@ -181,33 +181,33 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
|
|
|
|
try {
|
|
|
|
|
StringBuilder token;
|
|
|
|
|
int count = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// loop through text
|
|
|
|
|
while(tokens.hasMoreElements()) {
|
|
|
|
|
|
|
|
|
|
token = tokens.nextElement();
|
|
|
|
|
while(tokens.hasMoreElements()) {
|
|
|
|
|
|
|
|
|
|
token = tokens.nextElement();
|
|
|
|
|
if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
// if we have a full phrase, delete the first token
|
|
|
|
|
count++;
|
|
|
|
|
if(count > size)
|
|
|
|
|
phrase.delete(0, phrase.indexOf(SPACE)+1);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// append new token
|
|
|
|
|
if(phrase.length() > 1)
|
|
|
|
|
phrase.append(SPACE);
|
|
|
|
|
phrase.append(SPACE);
|
|
|
|
|
phrase.append(token);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if(count >= size) { // make sure we really have a phrase
|
|
|
|
|
if(phrases.containsKey(phrase.toString())) {
|
|
|
|
|
phrases.get(phrase.toString()).inc();
|
|
|
|
|
} else {
|
|
|
|
|
phrases.put(phrase.toString(), new YMarkTag(phrase.toString()));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return phrases;
|
|
|
|
|
} finally {
|
|
|
|
|
tokens.close();
|
|
|
|
@ -221,7 +221,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
|
|
|
|
else
|
|
|
|
|
return "/IOExceptions";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static boolean isDigitSpace(String str) {
|
|
|
|
|
if (str == null) {
|
|
|
|
|
return false;
|
|
|
|
@ -235,7 +235,8 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void run() {
|
|
|
|
|
@Override
|
|
|
|
|
public void run() {
|
|
|
|
|
Log.logInfo(YMarkTables.BOOKMARKS_LOG, "autoTagger run()");
|
|
|
|
|
Thread.currentThread().setUncaughtExceptionHandler(this);
|
|
|
|
|
String url = null;
|
|
|
|
@ -247,9 +248,9 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
|
|
|
|
while((url = this.bmkQueue.take()) != POISON) {
|
|
|
|
|
tagString = autoTag(url, this.loader, 5, tags);
|
|
|
|
|
if (tagString.equals("/IOExceptions")) {
|
|
|
|
|
this.ymarks.addFolder(bmk_user, url, tagString);
|
|
|
|
|
this.ymarks.addFolder(this.bmk_user, url, tagString);
|
|
|
|
|
tagString = "";
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// update tags
|
|
|
|
|
this.ymarks.addTags(this.bmk_user, url, tagString, this.merge);
|
|
|
|
|
|
|
|
|
@ -275,7 +276,8 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Uncaught-exception callback for the auto-tagger thread.
 *
 * {@link #run()} registers this object as the handler of the thread it runs on,
 * so a throwable escaping that thread ends up here. The handler only reports:
 * it writes a warning naming the thread to the bookmarks log and then logs the
 * throwable itself; it does not rethrow or attempt any recovery.
 *
 * NOTE(review): the method header appears twice in this span, once without and
 * once with {@code @Override}. This looks like the old/new sides of a diff
 * rather than two declarations; confirm which one is current.
 *
 * @param t the thread in which the throwable went uncaught; only its name is used
 * @param e the uncaught throwable, passed on to the logger
 */
public void uncaughtException(final Thread t, final Throwable e) {
|
|
|
|
|
@Override
|
|
|
|
|
public void uncaughtException(final Thread t, final Throwable e) {
|
|
|
|
|
// Record which thread failed in the bookmarks log before logging the throwable itself.
Log.logWarning(YMarkTables.BOOKMARKS_LOG, "I caught an uncaughtException in thread "+t.getName());
|
|
|
|
|
// Hand the throwable to the project logger.
Log.logException(e);
|
|
|
|
|
}
|
|
|
|
|