- bugfix for correctly sorting ymarks

- some tuning for the autotagger (still not perfect)
- /api/ymarks/get_metadata.xml now provides info for crawlstarts
- removed unused code

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8036 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
apfelmaennchen 14 years ago
parent ff32469272
commit abba31f02e

@ -1,6 +1,8 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.EnumMap;
import java.util.Iterator;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
@ -8,6 +10,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import de.anomic.data.UserDB;
import de.anomic.data.ymark.YMarkAutoTagger;
import de.anomic.data.ymark.YMarkCrawlStart;
import de.anomic.data.ymark.YMarkEntry;
import de.anomic.data.ymark.YMarkMetadata;
import de.anomic.data.ymark.YMarkTables;
@ -27,19 +30,44 @@ public class get_metadata {
final boolean isAuthUser = user!= null && user.hasRight(UserDB.AccessRight.BOOKMARK_RIGHT);
if(isAdmin || isAuthUser) {
final String bmk_user = (isAuthUser ? user.getUserName() : YMarkTables.USER_ADMIN);
try {
final String url = post.get(YMarkEntry.BOOKMARK.URL.key());
String url = post.get(YMarkEntry.BOOKMARK.URL.key(),YMarkEntry.BOOKMARK.URL.deflt());
boolean hasProtocol = false;
for (YMarkTables.PROTOCOLS p : YMarkTables.PROTOCOLS.values()) {
if(url.toLowerCase().startsWith(p.protocol())) {
hasProtocol = true;
break;
}
}
if (!hasProtocol) {
url=YMarkTables.PROTOCOLS.HTTP.protocol(url);
}
try {
YMarkMetadata meta = new YMarkMetadata(new DigestURI(url), sb.indexSegments);
final Document document = meta.loadDocument(sb.loader);
final EnumMap<YMarkMetadata.METADATA, String> metadata = meta.loadMetadata();
prop.putXML("title", metadata.get(YMarkMetadata.METADATA.TITLE));
prop.putXML("desc", metadata.get(YMarkMetadata.METADATA.DESCRIPTION));
prop.put("keywords", putTags(document.dc_subject(','), "keywords"));
prop.put("autotags", putTags(YMarkAutoTagger.autoTag(document, 5, sb.tables.bookmarks.getTags(bmk_user)), "autotags"));
final YMarkCrawlStart crawlStart = new YMarkCrawlStart(sb.tables, url);
final Iterator<String> iter = crawlStart.keySet().iterator();
int count = 0;
String key;
while(iter.hasNext()) {
key = iter.next();
prop.putXML("crawlstart_"+count+"_key",key.toLowerCase());
prop.putXML("crawlstart_"+count+"_value",crawlStart.get(key));
count++;
}
prop.put("crawlstart", count);
} catch (MalformedURLException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();

@ -10,4 +10,9 @@
#{autotags}#<tag name="#[tag]#" />
#{/autotags}#
</autotags>
<crawlstart
#{crawlstart}##[key]#="#[value]#"
#{/crawlstart}#
>
</crawlstart>
</info>

@ -1,6 +1,5 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.EnumMap;
import java.util.Iterator;
@ -9,18 +8,20 @@ import java.util.TreeMap;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import de.anomic.data.UserDB;
import de.anomic.data.ymark.YMarkAutoTagger;
import de.anomic.data.ymark.YMarkCrawlStart;
import de.anomic.data.ymark.YMarkEntry;
import de.anomic.data.ymark.YMarkMetadata;
import de.anomic.data.ymark.YMarkTables;
import de.anomic.data.ymark.YMarkTag;
import de.anomic.data.ymark.YMarkUtil;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -50,7 +51,7 @@ public class get_treeview {
boolean isMetadata = false;
boolean isURLdb = false;
boolean isCrawlStart = false;
boolean isWordCount = false;
boolean isAutoTagger = false;
boolean displayBmk = false;
if (post != null){
@ -73,7 +74,7 @@ public class get_treeview {
isURLdb = true;
isFolder = false;
} else if (post.get(ROOT).startsWith("w:")) {
isWordCount = true;
isAutoTagger = true;
isFolder = false;
} else if (post.get(ROOT).startsWith("c:")) {
isCrawlStart = true;
@ -192,7 +193,7 @@ public class get_treeview {
prop.put("folders_"+count+"_hash", "c:"+url);
prop.put("folders_"+count+"_hasChildren", "true");
count++;
prop.put("folders_"+count+"_foldername","<small><b>WordCounts</b></small>");
prop.put("folders_"+count+"_foldername","<small><b>AutoTagger</b></small>");
putProp(count, "meta");
prop.put("folders_"+count+"_hash", "w:"+url);
prop.put("folders_"+count+"_hasChildren", "true");
@ -205,22 +206,20 @@ public class get_treeview {
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
} else if (isWordCount || isMetadata || isURLdb || isCrawlStart) {
} else if (isAutoTagger || isMetadata || isURLdb || isCrawlStart) {
try {
final YMarkMetadata meta = new YMarkMetadata(new DigestURI(post.get(ROOT).substring(2)), sb.indexSegments);
meta.loadDocument(sb.loader);
if(isWordCount) {
final TreeMap<String,Word> words = meta.getWordCounts();
final ArrayList<String> topwords = new ArrayList<String>(words.descendingKeySet());
for(int i = 0; i < 20 && i < topwords.size(); i++) {
String word = topwords.get(i);
int occur = words.get(word).occurrences();
prop.put("folders_"+count+"_foldername","<small><b>"+word+":</b> [" + occur + "]</small>");
putProp(count, "meta");
count++;
}
count--;
prop.put("folders_"+count+"_comma", "");
final Document document = meta.loadDocument(sb.loader);
final TreeMap<String, YMarkTag> tags = sb.tables.bookmarks.getTags(bmk_user);
if(isAutoTagger) {
prop.put("folders_"+count+"_foldername","<small><b>meta-"+YMarkMetadata.METADATA.KEYWORDS.name().toLowerCase()+":</b> " + meta.loadMetadata().get(YMarkMetadata.METADATA.KEYWORDS) + "</small>");
putProp(count, "meta");
count++;
prop.put("folders_"+count+"_foldername","<small><b>with preference: </b>"+YMarkAutoTagger.autoTag(document, 4, tags)+"</small>");
putProp(count, "meta");
count++;
prop.put("folders_"+count+"_foldername","<small><b>without preference: </b>"+YMarkAutoTagger.autoTag(document, 4, new TreeMap<String, YMarkTag>())+"</small>");
putProp(count, "meta");
count++;
prop.put("folders", count);
} else if(isMetadata) {

@ -6,9 +6,14 @@ import net.yacy.kelondro.blob.Tables;
public class TablesRowComparator implements Comparator<Tables.Row> {
private String sortname;
private boolean desc;
public TablesRowComparator(final String sortname) {
public TablesRowComparator(final String sortname, final String sortorder) {
setSortName(sortname);
if(sortorder.equals("desc"))
this.desc = true;
else
this.desc = false;
}
public void setSortName(final String sortname) {
@ -20,7 +25,10 @@ public class TablesRowComparator implements Comparator<Tables.Row> {
if(row0.containsKey(this.sortname) && row1.containsKey(this.sortname)) {
String name1 = UTF8.String(row0.get(this.sortname)).toLowerCase();
String name2 = UTF8.String(row1.get(this.sortname)).toLowerCase();
return name1.compareTo(name2);
if(desc)
return name2.compareTo(name1);
else
return name1.compareTo(name2);
}
}
return 0;

@ -29,7 +29,8 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
public final static String SPACE = " ";
public final static String POISON = "";
public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo"));
public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo",
"and", "with", "the", "gt", "lt"));
private final ArrayBlockingQueue<String> bmkQueue;
@ -90,35 +91,40 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
// generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
final StringBuilder buffer = new StringBuilder(bufferSize);
final StringBuilder pwords = new StringBuilder(1000);
buffer.append(document.dc_title().toLowerCase());
buffer.append(document.dc_description().toLowerCase());
buffer.append(document.dc_subject(' ').toLowerCase());
final Enumeration<StringBuilder> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
int count = 0;
int score = 0;
// get phrases
final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
phrases.putAll(getPhrases(document, 3));
phrases.putAll(getPhrases(document, 4));
final Iterator<String> iter = phrases.keySet().iterator();
while(iter.hasNext()) {
count = 10;
score = 10;
final String phrase = iter.next();
if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
count = phrases.get(phrase).size() * phrase.split(" ").length * 35;
score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
}
if(isDigitSpace(phrase)) {
count = 10;
score = 10;
}
if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {
count = count * 10;
score = score * 10;
}
topwords.add(new YMarkTag(phrase, count));
if (tags.containsKey(phrase)) {
score = score * 20;
}
topwords.add(new YMarkTag(phrase, score));
pwords.append(phrase);
pwords.append(' ');
}
// loop through potential tag and rank them
while(tokens.hasMoreElements()) {
count = 0;
score = 0;
token = tokens.nextElement();
// check if the token appears in the text
@ -126,23 +132,27 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
final Word word = words.get(token.toString());
// token appears in text and matches an existing bookmark tag
if (tags.containsKey(token.toString())) {
count = word.occurrences() * tags.get(token.toString()).size() * 200;
score = word.occurrences() * tags.get(token.toString()).size() * 200;
}
// token appears in text and has more than 3 characters
else if (token.length()>3) {
count = word.occurrences() * 100;
score = word.occurrences() * 100;
}
// if token is already part of a phrase, reduce score
if(pwords.toString().indexOf(token.toString())>1) {
score = score / 3;
}
topwords.add(new YMarkTag(token.toString(), count));
topwords.add(new YMarkTag(token.toString(), score));
}
}
count = 0;
score = 0;
buffer.setLength(0);
for(final YMarkTag tag : topwords) {
if(count < max) {
if(score < max) {
if(tag.size() > 100) {
buffer.append(tag.name());
buffer.append(YMarkUtil.TAGS_SEPARATOR);
count++;
score++;
}
} else {
break;
@ -165,7 +175,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
while(tokens.hasMoreElements()) {
token = tokens.nextElement();
if(stopwords.contains(token.toString()))
if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
continue;
// if we have a full phrase, delete the first token

@ -52,11 +52,13 @@ public class YMarkCrawlStart extends HashMap<String,String>{
public void load(final String url) {
try {
final StringBuilder buffer = new StringBuilder(500);
buffer.append("^.*crawlingURL=\\Q");
//buffer.append("^.*crawlingURL=\\Q");
buffer.append("^crawl start for \\Q");
buffer.append(url);
buffer.append("\\E?.*");
final Pattern pattern = Pattern.compile(buffer.toString());
final Iterator<Tables.Row> APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern);
//final Iterator<Tables.Row> APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern);
final Iterator<Tables.Row> APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_COMMENT, pattern);
Tables.Row row = null;
while(APIcalls.hasNext()) {
row = APIcalls.next();

@ -29,19 +29,14 @@ package de.anomic.data.ymark;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.EnumMap;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Segments;
import de.anomic.crawler.retrieval.Response;
@ -141,18 +136,4 @@ public class YMarkMetadata {
}
return metadata;
}
/**
 * Returns the words of the currently held document, keyed by word and
 * ordered by {@link #sortWordCounts(Map)}.
 *
 * @return the sorted word map, or an empty map when no document is set
 */
public TreeMap<String,Word> getWordCounts() {
    // nothing loaded: hand back an empty (naturally ordered) map
    if (this.document == null) {
        return new TreeMap<String, Word>();
    }
    final Condenser condenser = new Condenser(this.document, true, true, LibraryProvider.dymLib);
    return sortWordCounts(condenser.words());
}
/**
 * Copies the given word map into a {@link TreeMap} whose key order is
 * decided by a {@link YMarkWordCountComparator} backed by that same map.
 *
 * @param unsorted_words word to {@link Word} mapping to be ordered
 * @return a new TreeMap filled from {@code unsorted_words}
 */
public static TreeMap<String,Word> sortWordCounts(final Map<String, Word> unsorted_words) {
    // the comparator looks each key up in the source map to rank it
    final YMarkWordCountComparator byCount = new YMarkWordCountComparator(unsorted_words);
    final TreeMap<String, Word> result = new TreeMap<String, Word>(byCount);
    result.putAll(unsorted_words);
    return result;
}
}

@ -27,9 +27,11 @@
package de.anomic.data.ymark;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.SortedSet;
import java.util.List;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
@ -214,17 +216,16 @@ public class YMarkTables {
return this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.TAGS.key(), p);
}
public SortedSet<Row> orderBookmarksBy(final Iterator<Row> rowIterator, final String sortname, final String sortorder) {
final TreeSet<Row> sortTree = new TreeSet<Tables.Row>(new TablesRowComparator(sortname));
public List<Row> orderBookmarksBy(final Iterator<Row> rowIterator, final String sortname, final String sortorder) {
final List<Row> sortList = new ArrayList<Row>();
Row row;
while (rowIterator.hasNext()) {
row = rowIterator.next();
if(row != null)
sortTree.add(row);
sortList.add(row);
}
if(sortorder.equals("desc"))
return sortTree.descendingSet();
return sortTree;
Collections.sort(sortList, new TablesRowComparator(sortname, sortorder));
return sortList;
}
public void addTags(final String bmk_user, final String url, final String tagString, final boolean merge) throws IOException, RowSpaceExceededException {

@ -1,53 +0,0 @@
// YMarkWordCountComparator.java
// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
// first published 2010 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data.ymark;
import java.util.Comparator;
import java.util.Map;
import net.yacy.kelondro.data.word.Word;
/**
 * Orders word keys by the occurrence count of the {@link Word} they map to,
 * lowest count first.
 *
 * Two different words with the same count are ordered by the natural order
 * of the key. Returning 0 for such a pair would make a sorted map or set
 * built on this comparator treat them as the same key and keep only one.
 */
public class YMarkWordCountComparator implements Comparator<String> {

    /** Lookup table used to resolve a key to its occurrence count. */
    private final Map<String,Word> words;

    /**
     * @param words word to {@link Word} mapping the comparator ranks keys by
     */
    public YMarkWordCountComparator(final Map<String,Word> words) {
        this.words = words;
    }

    /**
     * Compares two keys by occurrence count, then by the key itself.
     *
     * @param k1 first word key
     * @param k2 second word key
     * @return negative, zero or positive as {@code k1} sorts before, equal
     *         to or after {@code k2}; zero only when both keys are equal
     */
    public int compare(final String k1, final String k2) {
        final int c1 = occurrencesOf(k1);
        final int c2 = occurrencesOf(k2);
        if (c1 > c2) {
            return 1;
        }
        if (c1 < c2) {
            return -1;
        }
        // same count: fall back to the key so distinct words stay distinct
        return k1.compareTo(k2);
    }

    /** Returns the occurrence count for a key, or 0 when the key is not in the map. */
    private int occurrencesOf(final String key) {
        final Word word = this.words.get(key);
        return word == null ? 0 : word.occurrences();
    }
}
Loading…
Cancel
Save